Example #1
0
        /// <summary>
        /// Runs <c>TestTagBasedSearch</c> on every .pbf file found in the CompRef data folder,
        /// ordered by file name. The test is skipped when the data folder, the modifications
        /// file, or the FASTA file does not exist.
        /// </summary>
        public void TestTagBasedSearchCompRef()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(methodName);

            const string dataSetPath   = @"D:\MassSpecFiles\CompRef";
            const string fastaFilePath = @"D:\MassSpecFiles\CompRef\ID_003278_4B4B3CB1.fasta";
            const string modsFilePath  = @"D:\MassSpecFiles\CompRef\Mods.txt";

            if (!Directory.Exists(dataSetPath))
            {
                Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, dataSetPath);
            }
            if (!File.Exists(modsFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, modsFilePath);
            }
            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

            // Require the full ".pbf" extension and keep the path exactly as the file system reports it.
            // A bare "pbf" suffix test also accepts names such as "datapbf" or "data.xpbf", for which
            // a path rebuilt as "<name>.pbf" would point at a file that does not exist.
            var rawFiles = Directory.GetFiles(dataSetPath)
                                    .Where(filePath => filePath.EndsWith(".pbf", StringComparison.OrdinalIgnoreCase))
                                    .OrderBy(filePath => Path.GetFileNameWithoutExtension(filePath))
                                    .ToList();

            var fastaDb   = new FastaDatabase(fastaFilePath);
            var tolerance = new Tolerance(10);
            var aaSet     = new AminoAcidSet(modsFilePath);

            foreach (var rawFile in rawFiles)
            {
                var run = PbfLcMsRun.GetLcMsRun(rawFile);

                Console.WriteLine("-----------------{0}--------------------", rawFile);

                TestTagBasedSearch(run, fastaDb, tolerance, aaSet);

                Console.WriteLine("-----------------------------------------------------------------------");
            }
        }
Example #2
0
        /// <summary>
        /// Enumerates sequence annotations from the short FASTA test database, prints every
        /// annotation with its offset, then prints the total count and the elapsed time.
        /// The test is skipped when the FASTA file cannot be found.
        /// </summary>
        public void TestCountingPeptides()
        {
            const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\Short.fasta";

            var testName = MethodBase.GetCurrentMethod().Name;

            TestUtils.ShowStarting(testName);

            // The stopwatch starts before the file check, so the reported time includes it.
            var timer = System.Diagnostics.Stopwatch.StartNew();

            if (!File.Exists(dbFile))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, dbFile);
            }

            var indexedDb    = new IndexedDatabase(new FastaDatabase(dbFile));
            var peptideCount = 0;

            foreach (var entry in indexedDb.SequenceAnnotationsAndOffsetsWithNtermOrCtermCleavageNoLargerThan(100, 300, 1, 0))
            {
                Console.WriteLine("{0}\t{1}", entry.Annotation, entry.Offset);
                peptideCount++;
            }

            Console.WriteLine("NumPeptides: {0}", peptideCount);
            timer.Stop();

            Console.WriteLine(@"{0:f4} sec", timer.Elapsed.TotalSeconds);
        }
Example #3
0
        /// <summary>
        /// Copies the 14-column rows of the AMT peptide result file to a new TSV file, inserting
        /// a protein-length column (looked up in the FASTA database) after the protein column.
        /// Rows that do not split into exactly 14 fields are dropped. The test is skipped when
        /// the FASTA file or the result file is missing.
        /// </summary>
        public void AddProteinLengths()
        {
            const string databaseFilePath = @"H:\Research\IPRG2015\database\yeast6proteaprotein.fasta";
            const string resultPath       = @"H:\Research\IPRG2015\AMT_Peptides_NA.tsv";
            const string outputFilePath   = @"H:\Research\IPRG2015\AMT_Peptides.tsv";

            var testName = MethodBase.GetCurrentMethod().Name;

            TestUtils.ShowStarting(testName);

            if (!File.Exists(databaseFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, databaseFilePath);
            }

            var database = new FastaDatabase(databaseFilePath);

            database.Read();

            if (!File.Exists(resultPath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, resultPath);
            }

            using (var writer = new StreamWriter(outputFilePath))
            {
                foreach (var row in File.ReadLines(resultPath))
                {
                    // A null separator splits on whitespace.
                    var fields = row.Split(null);
                    if (fields.Length != 14)
                    {
                        continue;
                    }

                    // Everything after the peptide and protein columns is carried over unchanged.
                    var trailingColumns = string.Join("\t", fields.Skip(2));

                    if (fields[0].Equals("Peptide"))
                    {
                        // Header row: emit the new header including the extra Length column.
                        writer.WriteLine("Peptide\tProtein\tLength\t{0}", trailingColumns);
                    }
                    else
                    {
                        writer.WriteLine("{0}\t{1}\t{2}\t{3}", fields[0], fields[1], database.GetProteinLength(fields[1]), trailingColumns);
                    }
                }
            }
        }
Example #4
0
        /// <summary>
        /// Verifies that <c>SumAsParallel</c> and <c>SumDefault</c> return the same total for the
        /// character codes of the test FASTA database, then prints the average time per call
        /// of each implementation over 100 repetitions.
        /// </summary>
        public void TestSumParallel()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(methodName);

            var fastaFile = Utils.GetTestFile(methodName, Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"MSPathFinderT\ID_002216_235ACCEA.fasta"));

            var db = new FastaDatabase(fastaFile.FullName);

            db.Read();

            // Materialize once so that every sum call below works on the same in-memory list.
            var charArray = db.Characters().Select(c => (int)c).ToList();

            // The reference value must come from the sequential implementation; computing both
            // values with SumAsParallel would make the assertion below impossible to fail.
            var defaultSum  = SumDefault(charArray);
            var parallelSum = SumAsParallel(charArray);

            Console.WriteLine("Default sum {0}", defaultSum);
            Console.WriteLine("Parallel sum {0}", parallelSum);

            // NUnit argument order is (expected, actual).
            Assert.AreEqual(defaultSum, parallelSum);

            const int m  = 100;
            var       s1 = Stopwatch.StartNew();

            for (var i = 0; i < m; i++)
            {
                SumDefault(charArray);
            }
            s1.Stop();

            var s2 = Stopwatch.StartNew();

            for (var i = 0; i < m; i++)
            {
                SumAsParallel(charArray);
            }
            s2.Stop();

            Console.WriteLine("{0:F2} msec/sum, on average for default", s1.Elapsed.TotalMilliseconds / m);
            Console.WriteLine("{0:F2} msec/sum, on average for parallel", s2.Elapsed.TotalMilliseconds / m);
        }
Example #5
0
        /// <summary>
        /// Builds an indexed view of <paramref name="database"/> and returns its annotations and
        /// offsets for sequences between <c>MinSequenceLength</c> and <c>MaxSequenceLength</c>.
        /// </summary>
        /// <param name="database">FASTA database to index.</param>
        /// <returns>
        /// The no-enzyme enumeration when <c>NumTolerableTermini</c> is 0; otherwise the
        /// enzyme-aware enumeration for <c>Enzyme</c>.
        /// </returns>
        private IEnumerable <AnnotationAndOffset> GetAnnotationsAndOffsets(FastaDatabase database)
        {
            var indexedDb = new IndexedDatabase(database);

            if (NumTolerableTermini == 0)
            {
                return indexedDb.AnnotationsAndOffsetsNoEnzyme(MinSequenceLength, MaxSequenceLength);
            }

            // NOTE(review): the literal 2 is presumably the allowed number of missed cleavages - confirm
            // against IndexedDatabase.AnnotationsAndOffsets.
            return indexedDb.AnnotationsAndOffsets(MinSequenceLength, MaxSequenceLength, NumTolerableTermini, 2, Enzyme);
        }
Example #6
0
        /// <summary>
        /// Timing experiment: for each of the first 1000 database characters (fewer if the
        /// database is shorter), sums the integers below 10000 times the character value,
        /// then prints the grand total and the elapsed time.
        /// </summary>
        public void TestSequenceEnumerationParallel2()
        {
            var testName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(testName);

            var timer = new System.Diagnostics.Stopwatch();

            var fastaFile = Utils.GetTestFile(testName, Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"MSPathFinderT\ID_002216_235ACCEA.fasta"));

            var db = new FastaDatabase(fastaFile.FullName);

            db.Read();

            // Constructed as in the original experiment; not used further in this method.
            var indexedDb  = new IndexedDatabase(db);
            var characters = db.Characters().ToArray();

            timer.Start();

            // Only the first 1000 characters take part in the measurement.
            var limit = characters.Length < 1000 ? characters.Length : 1000;
            var total = 0L;

            for (var position = 0; position < limit; position++)
            {
                var upperBound = characters[position] * 10000;
                for (var i = 0; i < upperBound; i++)
                {
                    total += i;
                }
            }

            Console.WriteLine("NumPeptides: {0}", total);
            timer.Stop();

            Console.WriteLine(@"{0:f4} sec", timer.Elapsed.TotalSeconds);
        }
Example #7
0
        /// <summary>
        /// Runs a scan-based tag search for scan 4672 of the QC_Shew intact dataset, generating
        /// sequence tags on the fly, and prints each match with its tag start index, end index,
        /// and mass. The test is skipped when the raw, FASTA, or modifications file is missing.
        /// </summary>
        public void TestSearchWithTagGeneration()
        {
            const string rawFilePath   = @"D:\MassSpecFiles\training\raw\QC_Shew_Intact_26Sep14_Bane_C2Column3.pbf";
            const string fastaFilePath = @"D:\MSPathFinder\Fasta\ID_002216_235ACCEA.fasta";
            const string modsFilePath  = @"D:\MSPathFinder\Fasta\Mods.txt";
            const int    targetScan    = 4672;

            var testName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(testName);

            // Each input is checked immediately before it is loaded, so a missing later file
            // still skips the test only after the earlier inputs have been opened.
            if (!File.Exists(rawFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, rawFilePath);
            }

            var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, fastaFilePath);
            }

            var fastaDb         = new FastaDatabase(fastaFilePath);
            var searchTolerance = new Tolerance(10);

            if (!File.Exists(modsFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, modsFilePath);
            }

            var aminoAcids = new AminoAcidSet(modsFilePath);

            // The tag generator uses its own tolerance (8), separate from the search tolerance (10).
            var tagGenerator = new SequenceTagGenerator(run, new Tolerance(8));
            var peakMatrix   = new LcMsPeakMatrix(run);
            var searchEngine = new ScanBasedTagSearchEngine(run, tagGenerator, peakMatrix, fastaDb, searchTolerance, aminoAcids);

            foreach (var hit in searchEngine.RunSearch(targetScan))
            {
                var tagMatch = hit.TagMatch;

                Console.Write(hit.Sequence);
                Console.WriteLine("\t{0}\t{1}\t{2}", tagMatch.StartIndex, tagMatch.EndIndex, tagMatch.Mass);
            }
        }
Example #8
0
        /// <summary>
        /// Searches every annotation/offset candidate of <paramref name="db"/> in parallel and
        /// collects the results into <paramref name="matches"/>, reporting progress as it goes.
        /// </summary>
        /// <param name="matches">Per-spectrum match sets that receive the candidates found.</param>
        /// <param name="db">Database to search; its <c>IsDecoy</c> flag is passed to the matcher.</param>
        /// <param name="sequenceFilter">Filter handed to <c>SearchForMatches</c>.</param>
        /// <param name="cancellationToken">Optional token; when cancellation is requested, remaining candidates are skipped.</param>
        /// <param name="progress">Optional progress sink wrapped in a <c>ProgressData</c>.</param>
        private void RunSearch(SortedSet <DatabaseSequenceSpectrumMatch>[] matches, FastaDatabase db, ISequenceFilter sequenceFilter, CancellationToken?cancellationToken = null, IProgress <ProgressData> progress = null)
        {
            var progressData = new ProgressData(progress);
            progressData.Status = "Searching for matches";

            long estimatedProteins;
            var  candidates = GetAnnotationsAndOffsets(db, out estimatedProteins, cancellationToken);

            Console.WriteLine(@"Estimated proteins: " + estimatedProteins);

            var processedProteins = 0;
            var lastProgressTime  = DateTime.MinValue; // Forces the first (0%) progress update

            var timer = Stopwatch.StartNew();

            var parallelOptions = new ParallelOptions();
            parallelOptions.MaxDegreeOfParallelism = MaxNumThreads;
            parallelOptions.CancellationToken      = cancellationToken.HasValue ? cancellationToken.Value : CancellationToken.None;

            // N-terminal cleavages are only allowed when no internal cleavage is searched.
            var allowedNTermCleavages = SearchMode == InternalCleavageType.NoInternalCleavage ? MaxNumNTermCleavages : 0;

            Parallel.ForEach(candidates, parallelOptions, candidate =>
            {
                if (cancellationToken.HasValue && cancellationToken.Value.IsCancellationRequested)
                {
                    return;
                }

                // NOTE(review): the counter and timestamp are shared by reference across worker threads;
                // confirm that SearchProgressReport synchronizes access to them.
                SearchProgressReport(ref processedProteins, ref lastProgressTime, estimatedProteins, timer, progressData);
                SearchForMatches(candidate, sequenceFilter, matches, allowedNTermCleavages, db.IsDecoy, cancellationToken);
            });

            Console.WriteLine(@"Collected candidate matches: {0}", GetNumberOfMatches(matches));

            progressData.StatusInternal = string.Empty;
            progressData.Report(100.0);
        }
Example #9
0
        /// <summary>
        /// Enumerates sequence annotations from the short FASTA test database, prints the first
        /// 20 annotations with their offsets, then prints the total count and the elapsed time.
        /// </summary>
        public void TestCountingPeptides()
        {
            const int maxPrinted = 20;

            var testName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(testName);

            // The stopwatch starts before the test file is located, so the reported time includes that lookup.
            var timer = System.Diagnostics.Stopwatch.StartNew();

            var fastaFile = Utils.GetTestFile(testName, Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"MSPathFinderT\Short.fasta"));

            var indexedDb    = new IndexedDatabase(new FastaDatabase(fastaFile.FullName));
            var peptideCount = 0;

            foreach (var entry in indexedDb.SequenceAnnotationsAndOffsetsWithNtermOrCtermCleavageNoLargerThan(100, 300, 1, 0))
            {
                // Print only the leading entries; all entries are still counted.
                if (peptideCount < maxPrinted)
                {
                    Console.WriteLine("{0}\t{1}", entry.Annotation, entry.Offset);
                }
                peptideCount++;
            }

            Console.WriteLine("NumPeptides: {0}", peptideCount);
            timer.Stop();

            Console.WriteLine(@"{0:f4} sec", timer.Elapsed.TotalSeconds);
        }
Example #10
0
        /// <summary>
        /// Creates a container that holds, for each modification count from 0 to
        /// <paramref name="maxModifications"/>, one match set slot per MS2 scan.
        /// </summary>
        /// <param name="database">Database stored in <c>Database</c>.</param>
        /// <param name="ms2ScanVector">MS2 scan numbers; the position of a scan in this array is its index.</param>
        /// <param name="maxModifications">Largest modification count for which match sets are kept.</param>
        /// <param name="maxNumMatchesPerSpectrum">Stored in <c>NumMatchesPerSpectrum</c>.</param>
        /// <param name="minScore">Stored as the score cutoff; defaults to 4.</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="ms2ScanVector"/> is null.</exception>
        /// <exception cref="System.InvalidOperationException"><paramref name="ms2ScanVector"/> is empty.</exception>
        public ProteoformSpectrumMatchContainer(FastaDatabase database, int[] ms2ScanVector, int maxModifications, int maxNumMatchesPerSpectrum, int minScore = 4)
        {
            Database = database;
            NumMatchesPerSpectrum = maxNumMatchesPerSpectrum;
            _scoreCutoff          = minScore;
            Ms2ScanVector         = ms2ScanVector;

            // Size the lookup table from the largest scan number rather than from the last element,
            // so that a scan vector that is not sorted ascending cannot index past the end of the table.
            _ms2ScanToIndexMap = new int[ms2ScanVector.Max() + 1];
            for (var i = 0; i < ms2ScanVector.Length; i++)
            {
                _ms2ScanToIndexMap[ms2ScanVector[i]] = i;
            }

            // Entries for scan numbers that are not in ms2ScanVector keep the default value 0.

            _matchedSet = new SortedSet<DatabaseSequenceSpectrumMatch>[maxModifications + 1][];
            for (var i = 0; i <= maxModifications; i++)
            {
                _matchedSet[i] = new SortedSet<DatabaseSequenceSpectrumMatch>[ms2ScanVector.Length];
            }

            _checkedOutScanNumbers = new List <int>();
        }
Example #11
0
        /// <summary>
        /// Writes one tab-separated line per parsed result row: scan, sequence, modifications,
        /// mass, protein name, protein description, start, end, score, and q-value.
        /// </summary>
        /// <param name="writer">Destination for the merged rows; each row ends with "\n".</param>
        /// <param name="parser">Parsed TSV result file supplying the columns.</param>
        /// <param name="fastaDb">Database used to look up the protein description by name.</param>
        private void OutputMergedResult(TextWriter writer, TsvFileParser parser, FastaDatabase fastaDb)
        {
            // Look every column up once instead of once per row.
            // The score comes from "#MatchedFragments" when present, otherwise from "Score".
            var scoreColumn    = parser.GetData("#MatchedFragments") ?? parser.GetData("Score");
            var qValColumn     = parser.GetData("QValue");
            var sequenceColumn = parser.GetData("Sequence");
            var scanColumn     = parser.GetData("Scan");
            var massColumn     = parser.GetData("Mass");
            var protNameColumn = parser.GetData("ProteinName");
            var startColumn    = parser.GetData("Start");
            var endColumn      = parser.GetData("End");
            var modColumn      = parser.GetData("Modifications");

            for (var i = 0; i < parser.NumData; i++)
            {
                var sequence = sequenceColumn[i];
                var scanNum  = int.Parse(scanColumn[i]);
                var mass     = double.Parse(massColumn[i]);
                var protName = protNameColumn[i];
                var protDesc = fastaDb.GetProteinDescription(protName);

                var firstResId = int.Parse(startColumn[i]);
                var lastResId  = int.Parse(endColumn[i]);
                var score      = double.Parse(scoreColumn[i]);
                var mod        = modColumn[i];
                // Files without a QValue column report "0" for every row.
                var qvalue     = (qValColumn != null) ? qValColumn[i] : "0";

                writer.Write(scanNum);
                writer.Write("\t");
                writer.Write(sequence);
                writer.Write("\t");
                writer.Write(mod);
                writer.Write("\t");
                writer.Write(mass);
                writer.Write("\t");
                writer.Write(protName);
                writer.Write("\t");
                writer.Write(protDesc);
                writer.Write("\t");
                writer.Write(firstResId);
                writer.Write("\t");
                writer.Write(lastResId);
                writer.Write("\t");
                writer.Write(score);
                writer.Write("\t");
                writer.Write(qvalue);
                writer.Write("\n");
            }
        }
Example #12
0
        /// <summary>
        /// Prints the sum of the database character codes computed by <c>SumAsParallel</c> and
        /// by <c>SumDefault</c>, then prints the average time per call (in nanoseconds) of each
        /// implementation over 100 repetitions.
        /// </summary>
        public void TestSumParallel()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            TestUtils.ShowStarting(methodName);

            const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_002216_235ACCEA.fasta";
            var          db     = new FastaDatabase(dbFile);

            db.Read();

            // Materialize once. A deferred Select query would be re-enumerated by every call below,
            // so the timings would include re-reading and re-projecting the database characters.
            var charArray = db.Characters().Select(c => (int)c).ToList();

            // Test methods.
            Console.WriteLine(SumAsParallel(charArray));
            Console.WriteLine(SumDefault(charArray));

            const int m  = 100;
            var       s1 = Stopwatch.StartNew();

            for (var i = 0; i < m; i++)
            {
                SumDefault(charArray);
            }
            s1.Stop();
            var s2 = Stopwatch.StartNew();

            for (var i = 0; i < m; i++)
            {
                SumAsParallel(charArray);
            }
            s2.Stop();

            // Milliseconds * 1,000,000 = nanoseconds; divided by m for the per-call average.
            Console.WriteLine((s1.Elapsed.TotalMilliseconds * 1000000 /
                               m).ToString("0.00 ns"));
            Console.WriteLine((s2.Elapsed.TotalMilliseconds * 1000000 /
                               m).ToString("0.00 ns"));

            // No Console.Read() here: waiting for console input stalls an unattended test run.
        }
Example #13
0
        /// <summary>
        /// Reads the sequence tags (minimum length 8) of scan 4533 from the dataset's .seqtag
        /// file and, for every tag found in at least one database sequence, prints the tag, its
        /// prefix flag, its flanking mass, and the matching protein names. The test is skipped
        /// when the tag file or the FASTA file is missing.
        /// </summary>
        public void TestTagMatchingSingleSpec()
        {
            const string dataSet       = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3";
            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";
            const int    scanNum       = 4533;
            const int    minTagLength  = 8;

            var testName = MethodBase.GetCurrentMethod().Name;

            TestUtils.ShowStarting(testName);

            var tagFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".seqtag");

            if (!File.Exists(tagFileName))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, tagFileName);
            }

            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, fastaFilePath);
            }

            var fastaDb      = new FastaDatabase(fastaFilePath);
            var searchableDb = new SearchableDatabase(fastaDb);
            var tagParser    = new SequenceTagParser(tagFileName, minTagLength);

            foreach (var tag in tagParser.GetSequenceTags(scanNum))
            {
                var proteinNames = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence)
                                               .Select(index => fastaDb.GetProteinName(index))
                                               .ToArray();

                // Tags without any database hit are not reported.
                if (proteinNames.Length == 0)
                {
                    continue;
                }

                Console.WriteLine("{0}\t{1}\t{2}\t{3}", tag.Sequence, tag.IsPrefix, tag.FlankingMass, string.Join("\t", proteinNames));
            }
        }
Example #14
0
        /// <summary>
        /// Runs <c>TestTagBasedSearch</c> on the Lewy dataset against the a4_human FASTA database.
        /// The test is skipped when the raw file, the FASTA file, or the modifications file is missing.
        /// </summary>
        public void TestTagBasedSearchForLewy()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(methodName);

            const string rawFilePath   = @"D:\MassSpecFiles\Lewy\Lewy_AT_AD1_21May15_Bane_14-09-01RZ.pbf";
            const string fastaFilePath = @"D:\MassSpecFiles\Lewy\a4_human.fasta";
            const string modsFilePath  = @"D:\MassSpecFiles\Lewy\Mods.txt";

            if (!File.Exists(rawFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
            }

            var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

            var fastaDb   = new FastaDatabase(fastaFilePath);
            var tolerance = new Tolerance(10);

            if (!File.Exists(modsFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, modsFilePath);
            }

            var aaSet = new AminoAcidSet(modsFilePath);

            TestTagBasedSearch(run, fastaDb, tolerance, aaSet);
        }
 /// <summary>
 /// Convenience constructor that forwards all of its arguments to the ten-argument
 /// constructor overload, passing <c>null</c> for that overload's third argument.
 /// The constructor body itself does nothing.
 /// </summary>
 /// <param name="run">Forwarded unchanged as the first argument.</param>
 /// <param name="featureParser">Forwarded unchanged as the second argument.</param>
 /// <param name="tagParser">Forwarded unchanged as the fourth argument.</param>
 /// <param name="fastaDb">Forwarded unchanged.</param>
 /// <param name="tolerance">Forwarded unchanged.</param>
 /// <param name="aaSet">Forwarded unchanged.</param>
 /// <param name="maxSequenceMass">Forwarded unchanged; defaults to 50000.0.</param>
 /// <param name="minProductIonCharge">Forwarded unchanged; defaults to 1.</param>
 /// <param name="maxProductIonCharge">Forwarded unchanged; defaults to 20.</param>
 public FeatureBasedTagSearchEngine(
     LcMsRun run,
     Ms1FtParser featureParser,
     SequenceTagParser tagParser,
     FastaDatabase fastaDb,
     Tolerance tolerance,
     AminoAcidSet aaSet,
     double maxSequenceMass  = 50000.0,
     int minProductIonCharge = 1,
     int maxProductIonCharge = 20)
     : this(
         run,
         featureParser,
         null, // NOTE(review): meaning of the third argument is defined by the target overload, which is not visible here
         tagParser,
         fastaDb,
         tolerance,
         aaSet,
         maxSequenceMass,
         minProductIonCharge,
         maxProductIonCharge)
 {
 }
Example #16
0
        /// <summary>
        /// Timing experiment: builds a sequence graph in parallel for every annotation returned by
        /// <c>AnnotationsAndOffsets(7, 40, 2, 2, Enzyme.Trypsin)</c> and prints the elapsed seconds.
        /// The graphs themselves are discarded.
        /// </summary>
        private static void TestCountingPeptides()
        {
            const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_003456_9B916A8B.fasta";

            var aminoAcids = new AminoAcidSet();

            // Timing covers opening and indexing the database as well as the graph construction.
            var timer = Stopwatch.StartNew();

            var indexedDb = new IndexedDatabase(new FastaDatabase(dbFile));

            indexedDb.Read();

            var candidates = indexedDb.AnnotationsAndOffsets(7, 40, 2, 2, Enzyme.Trypsin);

            Parallel.ForEach(candidates, candidate =>
            {
                var annotation = candidate.Annotation;
                var offset     = candidate.Offset; // read as in the original experiment; not used

                SequenceGraph.CreateGraph(aminoAcids, annotation);
            });

            timer.Stop();
            var elapsedSeconds = timer.ElapsedTicks / (double)Stopwatch.Frequency;

            Console.WriteLine(@"{0:f4} sec", elapsedSeconds);
        }
Example #17
0
        public void TestFeatureIdMatching()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            TestUtils.ShowStarting(methodName);

            const string resultFilePath = @"H:\Research\QCShew_TopDown\Production\M1_V092\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv";

            if (!File.Exists(resultFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath);
            }

            var          resultParser    = new MsPathFinderParser(resultFilePath);
            const double qValueThreshold = 0.01;
            const double tolerancePpm    = 13;

            const string dataSet     = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3";
            var          rawFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".raw");

            if (!File.Exists(rawFileName))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFileName);
            }

            var run = PbfLcMsRun.GetLcMsRun(rawFileName);

            var idList =
                resultParser.GetIdList().TakeWhile(id => id.QValue <= qValueThreshold).OrderBy(id => id.Mass).ToList();
            var idMassList = idList.Select(id => id.Mass).ToList();
            var idFlag     = new bool[idList.Count];


            // Parse sequence tags
            var       tagFileName    = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".seqtag");
            const int minTagLength   = 6;
            const int numProtMatches = 4;
//            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";
            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.icsfldecoy.fasta";

            if (!File.Exists(tagFileName))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFileName);
            }

            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

            var fastaDb      = new FastaDatabase(fastaFilePath);
            var searchableDb = new SearchableDatabase(fastaDb);

            var tagParser = new SequenceTagParser(tagFileName, minTagLength);

            var featureFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".ms1ft");
            var featureParser   = new TsvFileParser(featureFileName);

            var minScan   = featureParser.GetData("MinScan").Select(s => Convert.ToInt32(s)).ToArray();
            var maxScan   = featureParser.GetData("MaxScan").Select(s => Convert.ToInt32(s)).ToArray();
            var minCharge = featureParser.GetData("MinCharge").Select(s => Convert.ToInt32(s)).ToArray();
            var maxCharge = featureParser.GetData("MaxCharge").Select(s => Convert.ToInt32(s)).ToArray();
            var monoMass  = featureParser.GetData("MonoMass").Select(Convert.ToDouble).ToArray();

            var numFeaturesWithId                    = 0;
            var numFeaturesWithMs2                   = 0;
            var numFeaturesWithTags                  = 0;
            var numFeaturesWithMatchingTags          = 0;
            var numFeaturesWithTwoOrMoreMatchingTags = 0;
            var numFeaturesWithNoIdAndMatchingTags   = 0;

            for (var i = 0; i < featureParser.NumData; i++)
            {
                var mass = monoMass[i];

                // Find Id
                var tolDa   = new Tolerance(tolerancePpm).GetToleranceAsDa(mass, 1);
                var minMass = mass - tolDa;
                var maxMass = mass + tolDa;
                var index   = idMassList.BinarySearch(mass);
                if (index < 0)
                {
                    index = ~index;
                }

                var matchedId = new List <MsPathFinderId>();
                // go down
                var curIndex = index - 1;
                while (curIndex >= 0)
                {
                    var curId = idList[curIndex];
                    if (curId.Mass < minMass)
                    {
                        break;
                    }
                    if (curId.Scan > minScan[i] && curId.Scan < maxScan[i] &&
                        curId.Charge >= minCharge[i] && curId.Charge <= maxCharge[i])
                    {
                        matchedId.Add(curId);
                        idFlag[curIndex] = true;
                    }
                    --curIndex;
                }

                // go up
                curIndex = index;
                while (curIndex < idList.Count)
                {
                    var curId = idList[curIndex];
                    if (curId.Mass > maxMass)
                    {
                        break;
                    }
                    if (curId.Scan >= minScan[i] && curId.Scan <= maxScan[i] &&
                        curId.Charge >= minCharge[i] && curId.Charge <= maxCharge[i])
                    {
                        matchedId.Add(curId);
                        idFlag[curIndex] = true;
                    }
                    ++curIndex;
                }

                var hasId = false;
                if (matchedId.Any())
                {
                    ++numFeaturesWithId;
                    hasId = true;
                }

                // Find MS2 scans
//                var numMs2Scans = 0;
                var tags   = new List <SequenceTag>();
                var hasMs2 = false;
                for (var scanNum = minScan[i]; scanNum <= maxScan[i]; scanNum++)
                {
                    var isolationWindow = run.GetIsolationWindow(scanNum);
                    if (isolationWindow == null)
                    {
                        continue;
                    }
                    var isolationWindowTargetMz = isolationWindow.IsolationWindowTargetMz;
                    var charge = (int)Math.Round(mass / isolationWindowTargetMz);
                    if (charge < minCharge[i] || charge > maxCharge[i])
                    {
                        continue;
                    }
                    var mz = Ion.GetIsotopeMz(mass, charge,
                                              Averagine.GetIsotopomerEnvelope(mass).MostAbundantIsotopeIndex);
                    if (isolationWindow.Contains(mz))
                    {
//                        ++numMs2Scans;
                        tags.AddRange(tagParser.GetSequenceTags(scanNum));
                        hasMs2 = true;
                    }
                }
                if (hasMs2)
                {
                    ++numFeaturesWithMs2;
                }
                if (tags.Any())
                {
                    ++numFeaturesWithTags;
                }
                var protHist      = new Dictionary <string, int>();
                var hasMatchedTag = false;
                foreach (var tag in tags)
                {
                    var matchedProteins = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence).Select(idx => fastaDb.GetProteinName(idx)).ToArray();
                    if (matchedProteins.Any())
                    {
                        hasMatchedTag = true;
                        foreach (var protein in matchedProteins)
                        {
                            int num;
                            if (protHist.TryGetValue(protein, out num))
                            {
                                protHist[protein] = num + 1;
                            }
                            else
                            {
                                protHist[protein] = 1;
                            }
                        }
                    }
                }
                if (hasMatchedTag)
                {
                    ++numFeaturesWithMatchingTags;
                    if (!hasId)
                    {
                        ++numFeaturesWithNoIdAndMatchingTags;
                    }
                }
                if (protHist.Any())
                {
                    var maxOcc = protHist.Values.Max();
                    if (maxOcc >= numProtMatches)
                    {
                        ++numFeaturesWithTwoOrMoreMatchingTags;
                    }
                }
            }

            Console.WriteLine("NumFeatures: {0}", featureParser.NumData);
            Console.WriteLine("NumId: {0}", idList.Count);
            Console.WriteLine("NumFeaturesWithId: {0} ({1})", numFeaturesWithId, numFeaturesWithId / (float)featureParser.NumData);
            Console.WriteLine("NumFeaturesWithMs2: {0} ({1})", numFeaturesWithMs2, numFeaturesWithMs2 / (float)featureParser.NumData);
            Console.WriteLine("NumFeaturesWithTag: {0} ({1})", numFeaturesWithTags, numFeaturesWithTags / (float)featureParser.NumData);
            Console.WriteLine("NumFeaturesWithMatchedTag: {0} ({1})", numFeaturesWithMatchingTags, numFeaturesWithMatchingTags / (float)featureParser.NumData);
            Console.WriteLine("NumFeaturesWithMoreThanOneMatchedTag: {0} ({1})", numFeaturesWithTwoOrMoreMatchingTags, numFeaturesWithTwoOrMoreMatchingTags / (float)featureParser.NumData);
            Console.WriteLine("NumFeaturesWithNoIdAndMatchedTag: {0} ({1})", numFeaturesWithNoIdAndMatchingTags, numFeaturesWithNoIdAndMatchingTags / (float)featureParser.NumData);

            for (var i = 0; i < idFlag.Length; i++)
            {
                if (!idFlag[i])
                {
                    Console.WriteLine(idList[i].Scan);
                }
            }


//            Console.WriteLine(string.Join(",", filter.GetMatchingMs2ScanNums(8115.973001)));
//
//            Console.WriteLine(featureFileName);
        }
Example #18
0
        /// <summary>
        /// For each dataset in a fixed list, reads its tab-separated .seqtag file, looks every tag of
        /// length 6 or more up in the FASTA database, and writes a .dmass file containing the original
        /// tag line plus the matched protein, the detected flanking mass, the flanking mass computed
        /// from the protein sequence, and the difference between the two.
        /// </summary>
        /// <remarks>
        /// The test is skipped (Assert.Ignore) when the data folder, the FASTA file, or a tag file
        /// is missing. A missing tag file is detected before the output file is created, so no
        /// empty .dmass file is left behind.
        /// </remarks>
        public void FindProteinDeltaMass()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(methodName);

            const string folderPath = @"D:\MassSpecFiles\Glyco\";

            if (!Directory.Exists(folderPath))
            {
                Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, folderPath);
            }

            var fileSet = new string[]
            {
                "User_sample_test_02252015", "User_sample_test_MWCO_02262016", "User_sample_test_SEC_F3_03022105",
                "User_sample_test_SEC_F1_02272015", "User_sample_test_SEC_F2_02282015"
            };
            const string fastaFilePath = folderPath + "ID_003836_DA9CC1E4.fasta";

            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

            // The database is the same for every dataset, so load and index it once
            // instead of once per loop iteration.
            var fastaDb      = new FastaDatabase(fastaFilePath);
            var searchableDb = new SearchableDatabase(fastaDb);

            for (var i = 0; i < fileSet.Length; i++)
            {
                var datasetName    = fileSet[i];
                var tagFilePath    = folderPath + datasetName + ".seqtag";
                var outputFilePath = folderPath + datasetName + ".dmass";

                // Check the input before opening the writer so a missing tag file
                // does not leave an empty output file on disk.
                if (!File.Exists(tagFilePath))
                {
                    Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
                }

                using (var writer = new StreamWriter(outputFilePath))
                {
                    var isHeader    = true;
                    var nReadSeqTag = 0;

                    Console.WriteLine(@"Reading {0} file", tagFilePath);

                    var nColumn = 0;

                    // ReadLines streams the file rather than loading every line into memory first.
                    foreach (var line in File.ReadLines(tagFilePath))
                    {
                        if (isHeader)
                        {
                            // The header fixes the expected column count and is echoed with the extra columns appended.
                            isHeader = false;
                            nColumn  = line.Split('\t').Length;
                            writer.WriteLine(line + "\t" + "Protein" + "\t" + "DetectedFlankingMass" + "\t" + "ExpectedFlankingMass" + "\t" + "DeltaMass");
                            continue;
                        }

                        var token = line.Split('\t');
                        if (token.Length != nColumn)
                        {
                            continue;
                        }

                        var tag = token[1];
                        if (tag.Length < 6)
                        {
                            continue;
                        }

                        var nTerminal            = token[2].Equals("1");
                        var detectedFlankingMass = Double.Parse(token[3]);

                        if (!nTerminal)
                        {
                            detectedFlankingMass -= Composition.H2O.Mass;
                        }

                        nReadSeqTag++;

                        var matchedProteins =
                            searchableDb.FindAllMatchedSequenceIndices(tag)
                            .Select(index => fastaDb.GetProteinName(index))
                            .Distinct().ToArray();

                        if (matchedProteins.Length < 1)
                        {
                            continue;
                        }

                        foreach (var protName in matchedProteins)
                        {
                            var seqStr = fastaDb.GetProteinSequence(protName);
                            var oriSeq = new Sequence(seqStr, AminoAcidSet.GetStandardAminoAcidSet());

                            var startIdx = 0;
                            while (true)
                            {
                                // Residue letters are plain identifiers, so compare ordinally
                                // rather than with the culture-sensitive default.
                                var idx = seqStr.IndexOf(tag, startIdx, StringComparison.Ordinal);

                                if (idx < 0)
                                {
                                    break;          // no further occurrence of the tag in this protein
                                }

                                // Number of alternative flank start/end offsets tried: two for N-terminal tags, one otherwise.
                                var nClv = (nTerminal) ? 2 : 1;

                                for (var j = 0; j < nClv; j++)
                                {
                                    var flankComposition = (nTerminal)
                                        ? oriSeq.GetComposition(j, idx)
                                        : oriSeq.GetComposition(idx + tag.Length, oriSeq.Count - j);

                                    var massDiff = (detectedFlankingMass - flankComposition.Mass);
                                    if (massDiff > -500 && massDiff < 2000)
                                    {
                                        writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", line, protName, detectedFlankingMass, flankComposition.Mass, massDiff);
                                    }

                                    if (massDiff > 2000)
                                    {
                                        break;
                                    }
                                }

                                // Resume after this occurrence; overlapping occurrences are not considered.
                                startIdx = idx + tag.Length;
                            }
                        }
                    }

                    Console.WriteLine(@"{0} seq tags are processed", nReadSeqTag);
                }
                Console.WriteLine(@"Done");
            }
        }
Example #19
0
        /// <summary>
        /// Reads a tab-separated sequence tag file, looks every tag of at least six residues up in the
        /// FASTA database, and prints the number of database entries, the number of distinct proteins
        /// matched by at least one tag, and an average.
        /// </summary>
        /// <remarks>
        /// The printed "AvgMatchedScansPerProtein" value is the total number of (tag, protein) matches
        /// divided by the number of entries in the database. The test is skipped when either input
        /// file is missing.
        /// </remarks>
        public void CountMatchedScansPerProtein()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(methodName);

            const int minTagLength = 6;

            const string fastaFilePath = @"D:\Research\Data\CommonContaminants\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta";
            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

            var fastaDb      = new FastaDatabase(fastaFilePath);
            var searchableDb = new SearchableDatabase(fastaDb);

            Console.WriteLine(@"Sequence length: {0}", fastaDb.GetSequence().Length);

            // Alternative inputs used previously:
            //   H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3_seqtag.tsv
            //   \\protoapps\UserData\Jungkap\Co_culture\23B_pellet_TD_3Feb14_Bane_PL011402.seqtag
            const string tagFilePath = @"D:\MassSpecFiles\co_culture\23A_pellet_TD_3Feb14_Bane_PL011402.seqtag";
            if (!File.Exists(tagFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
            }

            var scansByProtein = new Dictionary<string, HashSet<int>>();
            var matchCount     = 0;

            // The first line of the tag file is a header row, so it is skipped.
            foreach (var row in File.ReadAllLines(tagFilePath).Skip(1))
            {
                var fields = row.Split('\t');
                if (fields.Length != 3)
                {
                    continue;
                }

                var scan     = Convert.ToInt32(fields[0]);
                var sequence = fields[1];
                if (sequence.Length < minTagLength)
                {
                    continue;
                }

                foreach (var seqIndex in searchableDb.FindAllMatchedSequenceIndices(sequence))
                {
                    var proteinName = fastaDb.GetProteinName(seqIndex);
                    ++matchCount;

                    // Create the scan set on first sight of a protein, then record the scan either way.
                    HashSet<int> scansForProtein;
                    if (!scansByProtein.TryGetValue(proteinName, out scansForProtein))
                    {
                        scansForProtein = new HashSet<int>();
                        scansByProtein.Add(proteinName, scansForProtein);
                    }
                    scansForProtein.Add(scan);
                }
            }

            var numAllProteins = fastaDb.GetNumEntries();

            Console.WriteLine("NumAllProteins: {0}", numAllProteins);
            Console.WriteLine("NumMatchedProteins: {0}", scansByProtein.Count);
            Console.WriteLine("AvgMatchedScansPerProtein: {0}", matchCount / (float)numAllProteins);
        }
Example #20
0
        /// <summary>
        /// Compares sequence-tag database matches against an identification result file.
        /// Prints the average number of tag matches per scan, a histogram of how many distinct
        /// proteins each scan's tags matched, the number of identifications with QValue of at most
        /// 0.01, and how many of those had at least one tag matching the identified protein.
        /// </summary>
        /// <remarks>
        /// The result rows are read up to the first row whose QValue exceeds 0.01. The tag file is
        /// processed line by line and a scan's histogram entry is closed whenever the scan number
        /// changes from one line to the next. The test is skipped when any input file is missing.
        /// </remarks>
        public void CountMatchedProteins()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(methodName);

            const int minTagLength = 3;

            const string resultFilePath = @"H:\Research\ProMex\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv";
            if (!File.Exists(resultFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath);
            }

            var parser       = new TsvFileParser(resultFilePath);
            var scans        = parser.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray();
            var proteinNames = parser.GetData("ProteinName").ToArray();
            var qValues      = parser.GetData("QValue").Select(Convert.ToDouble).ToArray();

            // Identified protein per scan, and whether a tag has confirmed that identification yet.
            var scanToProtein = new Dictionary<int, string>();
            var idTag         = new Dictionary<int, bool>();
            for (var row = 0; row < qValues.Length; row++)
            {
                if (qValues[row] > 0.01)
                {
                    break;
                }
                scanToProtein.Add(scans[row], proteinNames[row]);
                idTag.Add(scans[row], false);
            }

            const string rawFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.raw";
            if (!File.Exists(rawFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
            }

            // NOTE(review): the returned run is never used below; the call is kept as in the original.
            var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

            // Alternative databases used previously:
            //   H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.icsfldecoy.fasta
            //   D:\Research\Data\CommonContaminants\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta
            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";
            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

            var fastaDb      = new FastaDatabase(fastaFilePath);
            var searchableDb = new SearchableDatabase(fastaDb);

            Console.WriteLine("Sequence length: {0}", fastaDb.GetSequence().Length);

            const string tagFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.seqtag";
            if (!File.Exists(tagFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
            }

            // Key: number of distinct proteins matched by a scan's tags; value: number of scans with that count.
            var hist = new Dictionary<int, int>();

            // Adds one finished scan to the histogram; does nothing when no scan has been started yet.
            Action<HashSet<string>> tallyScan = proteins =>
            {
                if (proteins == null)
                {
                    return;
                }

                int occurrences;
                hist.TryGetValue(proteins.Count, out occurrences);   // leaves 0 when the key is absent
                hist[proteins.Count] = occurrences + 1;
            };

            var scanSet = new HashSet<int>();
            HashSet<string> proteinsForCurrentScan = null;
            var prevScan        = -1;
            var totalNumMatches = 0L;

            // The first line of the tag file is a header row, so it is skipped.
            foreach (var row in File.ReadAllLines(tagFilePath).Skip(1))
            {
                var fields = row.Split('\t');
                if (fields.Length < 3)
                {
                    continue;
                }

                var scan = Convert.ToInt32(fields[0]);

                // Null when the scan has no identification passing the QValue cut.
                string identifiedProtein;
                if (!scanToProtein.TryGetValue(scan, out identifiedProtein))
                {
                    identifiedProtein = null;
                }

                if (scan != prevScan)
                {
                    // A new scan begins: close out the previous one and start a fresh protein set.
                    tallyScan(proteinsForCurrentScan);
                    prevScan               = scan;
                    proteinsForCurrentScan = new HashSet<string>();
                }

                scanSet.Add(scan);

                var tag = fields[1];
                if (tag.Length < minTagLength || proteinsForCurrentScan == null)
                {
                    continue;
                }

                foreach (var seqIndex in searchableDb.FindAllMatchedSequenceIndices(tag))
                {
                    var matchedProtein = fastaDb.GetProteinName(seqIndex);
                    proteinsForCurrentScan.Add(matchedProtein);
                    ++totalNumMatches;

                    if (identifiedProtein != null && matchedProtein.Equals(identifiedProtein))
                    {
                        idTag[scan] = true;
                    }
                }
            }

            // The last scan in the file has no following line to trigger its tally.
            tallyScan(proteinsForCurrentScan);

            Console.WriteLine("AvgNumMatches: {0}", totalNumMatches / (float)scanSet.Count);
            Console.WriteLine("Histogram:");
            foreach (var bucket in hist.OrderBy(e => e.Key))
            {
                Console.WriteLine("{0}\t{1}", bucket.Key, bucket.Value);
            }

            Console.WriteLine("NumId: {0}", idTag.Count);
            Console.WriteLine("NumIdByTag: {0}", idTag.Values.Count(confirmed => confirmed));
        }
Example #21
0
        /// <summary>
        /// Loads and indexes a FASTA database, estimates the peptide count, enumerates the no-enzyme
        /// annotations for lengths 30 to 250, and asserts that the enumerated count equals
        /// <paramref name="expected"/>. Elapsed time is printed after each stage.
        /// </summary>
        /// <param name="size">Nominal database size from the test case; not used by the test body.</param>
        /// <param name="dbFile">Database path; the "TEST_FOLDER" placeholder is replaced with the default test file folder.</param>
        /// <param name="expected">Expected number of enumerated sequences.</param>
        [TestCase(3, @"TEST_FOLDER\MSPathFinderT\ID_005133_8491EFA2.fasta", 323719193)]   // 3MB
        //[TestCase(6, @"TEST_FOLDER\MSPathFinderT\ID_004530_B63BD900.fasta", 595227563)]  // 6MB
        //[TestCase(15, @"TEST_FOLDER\MSPathFinderT\ID_004208_295531A4.fasta", 1882434687)]  // 15MB
        public void TestSequenceEnumeration(double size, string dbFile, int expected)
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(methodName, dbFile);

            var resolvedPath = dbFile.Replace("TEST_FOLDER", Utils.DEFAULT_TEST_FILE_FOLDER);
            var fastaFile    = Utils.GetTestFile(methodName, resolvedPath);

            var stopwatch = System.Diagnostics.Stopwatch.StartNew();

            // Stage 1: read and index the database.
            var db        = new FastaDatabase(fastaFile.FullName);
            var indexedDb = new IndexedDatabase(db);
            var dbLoaded  = stopwatch.Elapsed;
            Console.WriteLine("Read DB in " + dbLoaded.TotalSeconds + " Seconds");

            // Stage 2: estimate the number of peptides.
            var estimatedAnnOff = indexedDb.EstimateTotalPeptides(0, 30, 250);
            var estimateDone    = stopwatch.Elapsed;
            Console.WriteLine("Read Estimate in " + (estimateDone - dbLoaded).TotalSeconds + " Seconds");
            Console.WriteLine("Estimated results: " + estimatedAnnOff);

            // Stage 3: obtain the annotation enumeration.
            var annotationsAndOffsets = indexedDb.AnnotationsAndOffsetsNoEnzymeParallel(30, 250);
            var annotationsDone       = stopwatch.Elapsed;
            Console.WriteLine("Read Annotations in " + (annotationsDone - annotationsAndOffsets_Elapsed(estimateDone)).TotalSeconds + " Seconds");

            // Timing notes recorded by earlier runs:
            //   original: 110, 109 (total) seconds
            //   parallelizing AnnotationsAndOffsetsNoEnzyme: 86 seconds
            //   parallelizing AnnotationsAndOffsetsNoEnzyme, yield returns: 79.6, 94, 60, 60 seconds
            //   3MB, Parallel2: 107

            // Stage 4: count the enumerated sequences.
            long numSequences = annotationsAndOffsets.Count();
            var  countDone    = stopwatch.Elapsed;
            Console.WriteLine("Parallel ForEach in " + (countDone - annotationsDone).TotalSeconds + " Seconds");

            Console.WriteLine("NumPeptides: {0}", numSequences);
            stopwatch.Stop();

            Console.WriteLine(@"{0:f4} sec", stopwatch.Elapsed.TotalSeconds);
            Assert.AreEqual(expected, numSequences);
        }

        // Identity helper so the stage-3 subtraction reads as "time since the estimate finished".
        private static TimeSpan annotationsAndOffsets_Elapsed(TimeSpan estimateDone)
        {
            return estimateDone;
        }
Example #22
0
 /// <summary>
 /// Creates a writer that keeps references to the given database, run, and options.
 /// </summary>
 /// <param name="db">Stored in the <c>database</c> member.</param>
 /// <param name="run">Stored in the <c>lcmsRun</c> member.</param>
 /// <param name="options">Stored in the <c>options</c> member.</param>
 public MzidResultsWriter(FastaDatabase db, LcMsRun run, MsPfParameters options)
 {
     database = db;
     lcmsRun  = run;

     // The parameter shadows the member of the same name, so "this." is required here.
     this.options = options;
 }
Example #23
0
        /// <summary>
        /// Loads and indexes a FASTA database, estimates the peptide count, enumerates sequences of
        /// length 21 to 300 with at most one N-terminal and zero C-terminal cleavages, and asserts
        /// that the enumerated count equals <paramref name="expected"/>. Elapsed time is printed
        /// after each stage.
        /// </summary>
        /// <param name="size">Nominal database size from the test case; not used by the test body.</param>
        /// <param name="dbFile">Database path; the "TEST_FOLDER" placeholder is replaced with the default test file folder.</param>
        /// <param name="expected">Expected number of enumerated sequences.</param>
        [TestCase(15, @"TEST_FOLDER\MSPathFinderT\ID_004208_295531A4.fasta", 14862126)] // 15MB
        public void TestSequenceEnumerationNCTerm(double size, string dbFile, int expected)
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(methodName, dbFile);

            var resolvedPath = dbFile.Replace("TEST_FOLDER", Utils.DEFAULT_TEST_FILE_FOLDER);
            var fastaFile    = Utils.GetTestFile(methodName, resolvedPath);

            var stopwatch = System.Diagnostics.Stopwatch.StartNew();

            const int numNTermCleavages = 1;
            const int numCTermCleavages = 0;

            // Stage 1: read and index the database.
            var db        = new FastaDatabase(fastaFile.FullName);
            var indexedDb = new IndexedDatabase(db);
            var dbLoaded  = stopwatch.Elapsed;
            Console.WriteLine("Read DB in " + dbLoaded.TotalSeconds + " Seconds");

            // Stage 2: estimate the number of peptides.
            var estimatedAnnOff = indexedDb.EstimateTotalPeptides(1, 21, 300, numNTermCleavages, numCTermCleavages);
            var estimateDone    = stopwatch.Elapsed;
            Console.WriteLine("Read Estimate in " + (estimateDone - dbLoaded).TotalSeconds + " Seconds");
            Console.WriteLine("Estimated results: " + estimatedAnnOff);

            // Stage 3: obtain the annotation enumeration.
            var annotationsAndOffsets = indexedDb.SequenceAnnotationsAndOffsetsWithNtermOrCtermCleavageNoLargerThan(21, 300, numNTermCleavages, numCTermCleavages);
            var annotationsDone       = stopwatch.Elapsed;
            Console.WriteLine("Read Annotations in " + (annotationsDone - estimateDone).TotalSeconds + " Seconds");

            // Stage 4: count the enumerated sequences.
            long numSequences = annotationsAndOffsets.Count();
            var  countDone    = stopwatch.Elapsed;
            Console.WriteLine("Parallel ForEach in " + (countDone - annotationsDone).TotalSeconds + " Seconds");

            Console.WriteLine("NumPeptides: {0}", numSequences);
            stopwatch.Stop();

            Console.WriteLine(@"{0:f4} sec", stopwatch.Elapsed.TotalSeconds);
            Assert.AreEqual(expected, numSequences);
        }
Example #24
0
        /// <summary>
        /// Enumerates intact sequence annotations from a FASTA database, applies N-terminal
        /// truncations to each, prints every resulting annotation whose length is between 21 and 300,
        /// and prints how many fall into each of three counters ("Both", "N-term only",
        /// "C-term only") together with their sum and the elapsed time.
        /// </summary>
        /// <remarks>
        /// Annotations are treated as having four non-sequence characters (length minus 4 is used as
        /// the sequence length). The test is skipped when the database file is missing.
        /// </remarks>
        public void TestCountingProteoformsCloseToNTermOrCTerm()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            TestUtils.ShowStarting(methodName);

            const int minSequenceLength    = 21;
            const int maxSequenceLength    = 300;
            const int maxNumNTermCleavages = 1;
            const int maxNumCTermCleavages = 0;

            var stopwatch = System.Diagnostics.Stopwatch.StartNew();

            // Alternative database used previously:
            //   C:\cygwin\home\kims336\Data\TopDownQCShew\database\ID_002216_235ACCEA.fasta
            const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\Short.fasta";

            if (!File.Exists(dbFile))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dbFile);
            }

            var db        = new FastaDatabase(dbFile);
            var indexedDb = new IndexedDatabase(db);

            // Builds the annotation text for a given number of removed N-terminal residues:
            // the character at position 1 + n becomes the prefix and the text from 2 + n onward follows the dot.
            Func<string, int, string> truncate = (text, removed) =>
                removed == 0
                    ? text
                    : string.Format("{0}.{1}", text[1 + removed], text.Substring(2 + removed));

            var both      = 0L;
            var nTermOnly = 0L;
            var cTermOnly = 0L;

            // Pass 1: annotations with at most maxNumCTermCleavages C-terminal cleavages.
            // Every N-terminal truncation that keeps the length at or above the minimum is considered.
            var withinCTermLimit = indexedDb.IntactSequenceAnnotationsAndOffsets(
                minSequenceLength, int.MaxValue, maxNumCTermCleavages);
            foreach (var entry in withinCTermLimit)
            {
                var annotation = entry.Annotation;
                var length     = annotation.Length - 4;

                for (var removed = 0; length - removed >= minSequenceLength; removed++)
                {
                    if (length - removed > maxSequenceLength)
                    {
                        continue;
                    }

                    if (removed > maxNumNTermCleavages)
                    {
                        ++cTermOnly;
                    }
                    else
                    {
                        ++both;
                    }

                    Console.WriteLine(truncate(annotation, removed));
                }
            }

            // Pass 2: annotations with more than maxNumCTermCleavages C-terminal cleavages.
            // Only truncations up to maxNumNTermCleavages are considered.
            var beyondCTermLimit = indexedDb.IntactSequenceAnnotationsAndOffsetsWithCTermCleavagesLargerThan(
                minSequenceLength, int.MaxValue, maxNumCTermCleavages);
            foreach (var entry in beyondCTermLimit)
            {
                var annotation = entry.Annotation;
                var length     = annotation.Length - 4;

                for (var removed = 0; removed <= maxNumNTermCleavages; removed++)
                {
                    var cleavedLength = length - removed;
                    if (cleavedLength < minSequenceLength || cleavedLength > maxSequenceLength)
                    {
                        continue;
                    }

                    ++nTermOnly;
                    Console.WriteLine(truncate(annotation, removed));
                }
            }

            Console.WriteLine("Both: {0}", both);
            Console.WriteLine("N-term only: {0}", nTermOnly);
            Console.WriteLine("C-term only: {0}", cTermOnly);
            Console.WriteLine("All: {0}", both + nTermOnly + cTermOnly);
            stopwatch.Stop();

            Console.WriteLine(@"{0:f4} sec", stopwatch.Elapsed.TotalSeconds);
        }
Example #25
0
        /// <summary>
        /// Finds sequence tags in a single spectrum (scan 2), looks each tag up in the FASTA database,
        /// groups the matches by protein, and prints, for each protein in descending order of match
        /// count, every matched tag with its flanking masses and their differences from the masses
        /// computed from the protein sequence.
        /// </summary>
        /// <remarks>
        /// The test is skipped when the raw file or the FASTA file is missing, and fails with an
        /// explicit message when the requested scan is not a <c>ProductSpectrum</c>.
        /// </remarks>
        public void TestGetProteinsWithTagMatchingSingleSpec()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            TestUtils.ShowStarting(methodName);

            const int minNumTagMatches = 1;
            var       aminoAcidSet     = AminoAcidSet.GetStandardAminoAcidSet();

            const int scanNum = 2;

            // NOTE(review): the raw file path is empty, and File.Exists returns false for an empty
            // path, so this test is currently always skipped. The original also declared an unused
            // dataset path, H:\Research\Lewy\raw\Lewy_intact_07 - confirm the intended raw file.
            const string rawFilePath = "";

            const string fastaFilePath = @"H:\Research\Lewy\ID_004858_0EE8CF61.fasta";

            if (!File.Exists(rawFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
            }

            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

            var fastaDb      = new FastaDatabase(fastaFilePath);
            var searchableDb = new SearchableDatabase(fastaDb);
            var run          = PbfLcMsRun.GetLcMsRun(rawFilePath);

            // "as" yields null when the scan is missing or is not a product spectrum; fail here with a
            // clear message instead of passing null on to the tag finder.
            var spec = run.GetSpectrum(scanNum) as ProductSpectrum;
            Assert.IsNotNull(spec, "Scan {0} is not a product spectrum", scanNum);

            var tagFinder = new SequenceTagFinder(spec, new Tolerance(5));
            var tags      = tagFinder.GetAllSequenceTagString();

            // Group every database hit of every tag by the protein it falls in.
            var proteinsToTags = new Dictionary<string, IList<MatchedTag>>();

            foreach (var tag in tags)
            {
                var matchedIndices = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence).ToArray();
                foreach (var index in matchedIndices)
                {
                    var protein    = fastaDb.GetProteinName(index);
                    var startIndex = fastaDb.GetOneBasedPositionInProtein(index);
                    var matchedTag = new MatchedTag(tag, startIndex, 0.0);

                    IList<MatchedTag> existingTags;
                    if (!proteinsToTags.TryGetValue(protein, out existingTags))
                    {
                        existingTags = new List<MatchedTag>();
                        proteinsToTags.Add(protein, existingTags);
                    }
                    existingTags.Add(matchedTag);
                }
            }

            foreach (var entry in proteinsToTags.OrderByDescending(e => e.Value.Count))
            {
                // Entries are sorted by descending count, so the first one below the threshold ends the report.
                if (entry.Value.Count < minNumTagMatches)
                {
                    break;
                }

                var proteinName     = entry.Key;
                var proteinSequence = fastaDb.GetProteinSequence(proteinName);
                var protein         = new Sequence(proteinSequence, aminoAcidSet);
                Console.WriteLine(proteinName + "\t" + entry.Value.Count);

                foreach (var matchedTag in entry.Value)
                {
                    var seq = proteinSequence.Substring(matchedTag.StartIndex,
                                                        matchedTag.EndIndex - matchedTag.StartIndex);
                    var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
                    var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);
                    Console.WriteLine("\t{0} ({1})\t{2}\t{3} ({4})\t{5}\t{6}\t{7}",
                                      matchedTag.NTermFlankingMass, (matchedTag.NTermFlankingMass - nTermMass),
                                      seq,
                                      matchedTag.CTermFlankingMass, (matchedTag.CTermFlankingMass - cTermMass),
                                      matchedTag.StartIndex,
                                      matchedTag.IsNTermFlankingMassReliable, matchedTag.IsCTermFlankingMassReliable);
                }
            }
        }
Example #26
0
        /// <summary>
        /// Opens the FASTA database at a fixed UNC path and prints the value
        /// returned by <c>GetNumEntries()</c>.
        /// </summary>
        public void TestFasta()
        {
            const string fastaFilePath = @"\\protoapps\UserData\Jungkap\Lewy\db\ID_005140_7A170668.fasta";

            var fastaDb    = new FastaDatabase(fastaFilePath);
            var numEntries = fastaDb.GetNumEntries();

            Console.WriteLine(numEntries);
        }
Example #27
0
        /// <summary>
        /// Enumerates the annotations of length 300-400 produced by
        /// <c>AnnotationsAndOffsetsNoEnzyme</c> for the test FASTA file and builds an 11-bin
        /// histogram of (rounded rescaled composition mass - nominal mass). Enumeration stops
        /// early once the elapsed time exceeds 60 seconds (checked every 100 sequences).
        /// The histogram and the elapsed time are written to the console.
        /// </summary>
        public void TestNominalMassErrors()
        {
            const int maxRuntimeSeconds = 60;
            const int minLength         = 300;
            const int maxLength         = 400;

            var methodName = MethodBase.GetCurrentMethod().Name;
            Utils.ShowStarting(methodName);

            var fastaFile = Utils.GetTestFile(methodName, Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"MSPathFinderT\ID_003962_71E1A1D4.fasta"));

            var db = new FastaDatabase(fastaFile.FullName);
            db.Read();

            var indexedDb    = new IndexedDatabase(db);
            var numSequences = 0L;

            // Timing starts only after the database has been read and indexed.
            var sw = System.Diagnostics.Stopwatch.StartNew();

            var hist      = new long[11];
            var centerBin = hist.Length / 2;
            var lastBin   = hist.Length - 1;
            var aaSet     = new AminoAcidSet();

            foreach (var entry in indexedDb.AnnotationsAndOffsetsNoEnzyme(minLength, maxLength))
            {
                numSequences++;

                // Drop the first two and the last two characters of the annotation.
                var annotation  = entry.Annotation;
                var composition = aaSet.GetComposition(annotation.Substring(2, annotation.Length - 4));

                var massError = (int)Math.Round(composition.Mass * Constants.RescalingConstant) - composition.NominalMass;

                // Errors beyond either end of the histogram are counted in the outermost bins.
                var bin = Math.Max(0, Math.Min(massError + centerBin, lastBin));
                hist[bin]++;

                // Only look at the clock every 100 sequences.
                if (numSequences % 100 == 0 && sw.Elapsed.TotalSeconds > maxRuntimeSeconds)
                {
                    break;
                }
            }

            Console.WriteLine("Sequence count: {0:N0}", numSequences);
            Console.WriteLine("{0,10}  {1,10}  {2,10}", "Bin ", "Count", "Fraction");

            for (var bin = 0; bin < hist.Length; bin++)
            {
                Console.WriteLine("{0,10:F1}  {1,10:N0}  {2,10:F1}%", bin - centerBin, hist[bin], hist[bin] / (double)numSequences * 100);
            }

            sw.Stop();

            Console.WriteLine(@"Elapsed Time: {0:F1} sec", sw.Elapsed.TotalSeconds);
        }
Example #28
0
        /// <summary>
        /// Measures how long it takes to call
        /// <c>GetFullPrecursorIonExtractedIonChromatogram</c> on a <c>PbfLcMsRun</c> for the
        /// first 100,000 annotations enumerated from the indexed FASTA database with
        /// <c>Enzyme.Trypsin</c>. The test is ignored when either input file is missing.
        /// </summary>
        public void TestRunningTimeChromGen()
        {
            const string rafFilePath = @"C:\cygwin\home\kims336\Data\QCShewQE\QC_Shew_13_04_A_17Feb14_Samwise_13-07-28.raf";
            const string dbFile      = @"D:\Research\Data\CommonContaminants\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta";
            const int    maxPeptides = 100000;

            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            if (!File.Exists(rafFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rafFilePath);
            }

            var rafRun    = new PbfLcMsRun(rafFilePath);
            var tolerance = new Tolerance(10);

            if (!File.Exists(dbFile))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dbFile);
            }

            var indexedDb = new IndexedDatabase(new FastaDatabase(dbFile));
            var aaSet     = new AminoAcidSet(Modification.Carbamidomethylation);

            var sw          = System.Diagnostics.Stopwatch.StartNew();
            var numPeptides = 0;

            foreach (var peptide in indexedDb.AnnotationsAndOffsets(6, 30, 2, 2, Enzyme.Trypsin))
            {
                numPeptides++;

                // Drop the first two and the last two characters of the annotation, then add water.
                var annotation = peptide.Annotation;
                var residues   = annotation.Substring(2, annotation.Length - 4);
                var comp       = new Sequence(residues, aaSet).Composition + Composition.H2O;

                // NOTE(review): the second Ion argument is presumably the charge state - confirm.
                var precursorIon = new Ion(comp, 2);

                // Only the run time matters here; the chromatogram itself is discarded.
                // (An earlier variant compared the XICs of two run readers element by element.)
                rafRun.GetFullPrecursorIonExtractedIonChromatogram(precursorIon.GetMonoIsotopicMz(), tolerance);

                if (numPeptides >= maxPeptides)
                {
                    break;
                }
            }

            sw.Stop();

            Console.WriteLine(@"{0:f4} sec", sw.Elapsed.TotalSeconds);
        }
Example #29
0
        /// <summary>
        /// For every MS2 scan of the dataset, counts how many scans have sequence tags, how many
        /// have at least one tag found in the FASTA database, and how many of those have no
        /// identification at q-value &lt;= 0.01 in the result file. The counts and their fractions
        /// of all MS2 scans are written to the console. The test is ignored when any input file
        /// is missing.
        /// </summary>
        public void TestTagMatching()
        {
            const string dataSet         = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3";
            const string resultFilePath  = @"H:\Research\QCShew_TopDown\Production\M1_V092\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv";
            const string fastaFilePath   = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";
            const int    minTagLength    = 8;
            const double qValueThreshold = 0.01;

            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            // Sequence tags
            var tagFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".seqtag");
            if (!File.Exists(tagFileName))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFileName);
            }
            var tagParser = new SequenceTagParser(tagFileName, minTagLength);

            // Raw spectra
            var rawFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".raw");
            if (!File.Exists(rawFileName))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFileName);
            }
            var run = PbfLcMsRun.GetLcMsRun(rawFileName);

            // Identifications: flag each scan number that has an ID passing the q-value threshold
            if (!File.Exists(resultFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath);
            }
            var resultParser  = new MsPathFinderParser(resultFilePath);
            var identified    = resultParser.GetIdWithQValuesNoLargerThan(qValueThreshold);
            var scanHasId     = new bool[run.MaxLcScan + 1];
            foreach (var id in identified)
            {
                scanHasId[id.Scan] = true;
            }

            // Protein database
            // Alternative database file used previously: ID_002216_235ACCEA.icsfldecoy.fasta
            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }
            var fastaDb      = new FastaDatabase(fastaFilePath);
            var searchableDb = new SearchableDatabase(fastaDb);

            var ms2Count             = 0;
            var withTagCount         = 0;
            var withMatchedTagCount  = 0;
            var matchedTagNoIdCount  = 0;

            foreach (var ms2ScanNum in run.GetScanNumbers(2))
            {
                ms2Count++;

                var tags = tagParser.GetSequenceTags(ms2ScanNum);
                if (tags == null)
                {
                    continue;
                }

                withTagCount++;

                foreach (var tag in tags)
                {
                    if (searchableDb.Search(tag.Sequence) < 0)
                    {
                        continue;
                    }

                    // The first tag found in the database is enough; count the scan once and move on.
                    withMatchedTagCount++;
                    if (!scanHasId[ms2ScanNum])
                    {
                        matchedTagNoIdCount++;
                    }
                    break;
                }
            }

            Console.WriteLine("Tag length: {0}", minTagLength);
            Console.WriteLine("NumMs2Spectra: {0}", ms2Count);
            Console.WriteLine("NumMs2SpectraWithTags: {0} ({1})", withTagCount, withTagCount / (float)ms2Count);
            Console.WriteLine("NumMs2SpectraWithMatchingTags: {0} ({1})", withMatchedTagCount, withMatchedTagCount / (float)ms2Count);
            Console.WriteLine("NumMs2SpectraWithMatchingTagsWithNoId: {0} ({1})", matchedTagNoIdCount, matchedTagNoIdCount / (float)ms2Count);
        }
Example #30
0
        /// <summary>
        /// Collects the sequence tags of the MS2 scans that match one MS1 feature (fixed mass and
        /// scan range), looks each tag up in the FASTA database, groups the matches by protein and,
        /// for the protein with the most matched tags, prints the tags before and after merging
        /// them with a <c>MatchedTagSet</c>.
        /// The test is ignored when any of the input files (.raw, .ms1ft, .seqtag, .fasta) is missing.
        /// </summary>
        public void TestFeatureId()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            TestUtils.ShowStarting(methodName);

            // Base path of the dataset, without extension; the input file names are derived from it.
            const string dataSet       = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3";
            const string tagFileName   = dataSet + ".seqtag"; //"_MinLength3.seqtag"; //Path.ChangeExtension(dataSet, ".seqtag");
            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";

            var rawFileName     = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".raw");
            var featureFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".ms1ft");

            // dataSet itself is not a file, so verify every file that is actually opened below
            // and skip the test (before doing any work) if one of them is missing.
            foreach (var requiredFile in new string[] { rawFileName, featureFileName, tagFileName, fastaFilePath })
            {
                if (!File.Exists(requiredFile))
                {
                    Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, requiredFile);
                }
            }

            // Feature: 5236-5286	6-12	8480.3681	5
            const int    minScanNum  = 5236;
            const int    maxScanNum  = 5286;
            const double featureMass = 8480.3681;

            // Other features that were examined with this test:
            //const int minScanNum = 7251;
            //const int maxScanNum = 7326;
            //const double featureMass = 32347.18;

            //const int minScanNum = 4451;
            //const int maxScanNum = 4541;
            //const double featureMass = 31267.95;

            var tolerance        = new Tolerance(10);
            var relaxedTolerance = new Tolerance(20);

            const int minTagLength       = 5;
            const int minMergedTagLength = 7;
            const int minNumTagMatches   = 1;

            var run = PbfLcMsRun.GetLcMsRun(rawFileName);

            var aminoAcidSet = AminoAcidSet.GetStandardAminoAcidSet();
            var filter       = new Ms1FtFilter(run, tolerance, featureFileName);

            // Only scans strictly between minScanNum and maxScanNum are used.
            var ms2ScanNums =
                filter.GetMatchingMs2ScanNums(featureMass)
                .Where(scanNum => scanNum > minScanNum && scanNum < maxScanNum)
                .ToArray();

            var fastaDb      = new FastaDatabase(fastaFilePath);
            var searchableDb = new SearchableDatabase(fastaDb);
            var tagParser    = new SequenceTagParser(tagFileName, minTagLength);

            // Protein name -> every tag occurrence found in that protein
            var proteinsToTags = new Dictionary <string, IList <MatchedTag> >();

            foreach (var ms2ScanNum in ms2ScanNums)
            {
                var tags = tagParser.GetSequenceTags(ms2ScanNum);
                if (tags == null)
                {
                    // No tags were reported for this scan
                    continue;
                }

                foreach (var tag in tags)
                {
                    var matchedIndices = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence).ToArray();
                    foreach (var index in matchedIndices)
                    {
                        var protein    = fastaDb.GetProteinName(index);
                        var startIndex = fastaDb.GetZeroBasedPositionInProtein(index);
                        var matchedTag = new MatchedTag(tag, startIndex, featureMass);
                        IList <MatchedTag> existingTags;
                        if (proteinsToTags.TryGetValue(protein, out existingTags))
                        {
                            existingTags.Add(matchedTag);
                        }
                        else
                        {
                            proteinsToTags.Add(protein, new List <MatchedTag> {
                                matchedTag
                            });
                        }
                    }
                }
            }

            foreach (var entry in proteinsToTags.OrderByDescending(e => e.Value.Count))
            {
                if (entry.Value.Count < minNumTagMatches)
                {
                    break;
                }
                var proteinName     = entry.Key;
                var proteinSequence = fastaDb.GetProteinSequence(proteinName);
                var protein         = new Sequence(proteinSequence, aminoAcidSet);
                Console.WriteLine(proteinName + "\t" + entry.Value.Count);

                var matchedTagSet = new MatchedTagSet(proteinSequence, aminoAcidSet,
                                                      tolerance, relaxedTolerance);

                Console.WriteLine("********** Before merging");
                foreach (var matchedTag in entry.Value)
                {
                    var seq = proteinSequence.Substring(matchedTag.StartIndex,
                                                        matchedTag.EndIndex - matchedTag.StartIndex);
                    var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
                    var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);
                    Console.WriteLine("\t{0}\t{1}\t{2}\t{3}\t{4}\t{5}",
                                      (matchedTag.NTermFlankingMass - nTermMass), seq, (matchedTag.CTermFlankingMass - cTermMass), matchedTag.StartIndex,
                                      matchedTag.IsNTermFlankingMassReliable, matchedTag.IsCTermFlankingMassReliable);

                    matchedTagSet.Add(matchedTag);
                }

                Console.WriteLine("********** After merging");
                foreach (var matchedTag in matchedTagSet.Tags)
                {
                    if (matchedTag.Length < minMergedTagLength)
                    {
                        continue;
                    }
                    var seq = proteinSequence.Substring(matchedTag.StartIndex,
                                                        matchedTag.EndIndex - matchedTag.StartIndex);
                    var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
                    var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);
                    Console.WriteLine("\t{0}\t{1}\t{2}\t{3}\t{4}\t{5}",
                                      (matchedTag.NTermFlankingMass - nTermMass), seq, (matchedTag.CTermFlankingMass - cTermMass), matchedTag.StartIndex,
                                      matchedTag.IsNTermFlankingMassReliable, matchedTag.IsCTermFlankingMassReliable);
                }

                // Only the protein with the most matched tags is reported.
                break;
            }
        }