示例#1
0
        public void TestFeatureId()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            TestUtils.ShowStarting(methodName);

            const string dataSet = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3";

            if (!File.Exists(dataSet))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dataSet);
            }

            // Feature: 5236-5286	6-12	8480.3681	5
            const int    minScanNum  = 5236;
            const int    maxScanNum  = 5286;
            const double featureMass = 8480.3681;

            //const int minScanNum = 7251;
            //const int maxScanNum = 7326;
            //const double featureMass = 32347.18;

//            const int minScanNum = 4451;
//            const int maxScanNum = 4541;
//            const double featureMass = 31267.95;

            var tolerance        = new Tolerance(10);
            var relaxedTolerance = new Tolerance(20);

            const int minTagLength       = 5;
            const int minMergedTagLength = 7;
            const int minNumTagMatches   = 1;

            var rawFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".raw");
            var run         = PbfLcMsRun.GetLcMsRun(rawFileName);

            var aminoAcidSet    = AminoAcidSet.GetStandardAminoAcidSet();
            var featureFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".ms1ft");
            var filter          = new Ms1FtFilter(run, tolerance, featureFileName);
            var ms2ScanNums     =
                filter.GetMatchingMs2ScanNums(featureMass)
                .Where(scanNum => scanNum > minScanNum && scanNum < maxScanNum)
                .ToArray();

            const string tagFileName   = dataSet + ".seqtag"; //"_MinLength3.seqtag"; //Path.ChangeExtension(dataSet, ".seqtag");
            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";

            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

            var fastaDb      = new FastaDatabase(fastaFilePath);
            var searchableDb = new SearchableDatabase(fastaDb);
            var tagParser    = new SequenceTagParser(tagFileName, minTagLength);

            var proteinsToTags = new Dictionary <string, IList <MatchedTag> >();

            foreach (var ms2ScanNum in ms2ScanNums)
            {
                var tags = tagParser.GetSequenceTags(ms2ScanNum);
                foreach (var tag in tags)
                {
                    var matchedIndices = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence).ToArray();
                    foreach (var index in matchedIndices)
                    {
                        var protein    = fastaDb.GetProteinName(index);
                        var startIndex = fastaDb.GetZeroBasedPositionInProtein(index);
                        var matchedTag = new MatchedTag(tag, startIndex, featureMass);
                        IList <MatchedTag> existingTags;
                        if (proteinsToTags.TryGetValue(protein, out existingTags))
                        {
                            existingTags.Add(matchedTag);
                        }
                        else
                        {
                            proteinsToTags.Add(protein, new List <MatchedTag> {
                                matchedTag
                            });
                        }
                    }
                }
            }

            foreach (var entry in proteinsToTags.OrderByDescending(e => e.Value.Count))
            {
                if (entry.Value.Count < minNumTagMatches)
                {
                    break;
                }
                var proteinName     = entry.Key;
                var proteinSequence = fastaDb.GetProteinSequence(proteinName);
                var protein         = new Sequence(proteinSequence, aminoAcidSet);
                Console.WriteLine(proteinName + "\t" + entry.Value.Count);

                var matchedTagSet = new MatchedTagSet(proteinSequence, aminoAcidSet,
                                                      tolerance, relaxedTolerance);

                Console.WriteLine("********** Before merging");
                foreach (var matchedTag in entry.Value)
                {
                    var seq = proteinSequence.Substring(matchedTag.StartIndex,
                                                        matchedTag.EndIndex - matchedTag.StartIndex);
                    var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
                    var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);
                    Console.WriteLine("\t{0}\t{1}\t{2}\t{3}\t{4}\t{5}",
                                      (matchedTag.NTermFlankingMass - nTermMass), seq, (matchedTag.CTermFlankingMass - cTermMass), matchedTag.StartIndex,
                                      matchedTag.IsNTermFlankingMassReliable, matchedTag.IsCTermFlankingMassReliable);

                    matchedTagSet.Add(matchedTag);
                }

                Console.WriteLine("********** After merging");
                foreach (var matchedTag in matchedTagSet.Tags)
                {
                    if (matchedTag.Length < minMergedTagLength)
                    {
                        continue;
                    }
                    var seq = proteinSequence.Substring(matchedTag.StartIndex,
                                                        matchedTag.EndIndex - matchedTag.StartIndex);
                    var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
                    var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);
                    Console.WriteLine("\t{0}\t{1}\t{2}\t{3}\t{4}\t{5}",
                                      (matchedTag.NTermFlankingMass - nTermMass), seq, (matchedTag.CTermFlankingMass - cTermMass), matchedTag.StartIndex,
                                      matchedTag.IsNTermFlankingMassReliable, matchedTag.IsCTermFlankingMassReliable);
                }

                break;
            }
        }
示例#2
0
        public void FindProteinDeltaMass()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(methodName);

            const string folderPath = @"D:\MassSpecFiles\Glyco\";

            if (!Directory.Exists(folderPath))
            {
                Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, folderPath);
            }

            var fileSet = new string[]
            {
                "User_sample_test_02252015", "User_sample_test_MWCO_02262016", "User_sample_test_SEC_F3_03022105",
                "User_sample_test_SEC_F1_02272015", "User_sample_test_SEC_F2_02282015"
            };
            const string fastaFilePath = folderPath + "ID_003836_DA9CC1E4.fasta";

            for (var i = 0; i < fileSet.Length; i++)
            {
                var datasetName = fileSet[i];
                var tagFilePath = folderPath + datasetName + ".seqtag";
                //var outputFilePath = folderPath + datasetName + ".matchedtag";
                var outputFilePath = folderPath + datasetName + ".dmass";
                var fastaDb        = new FastaDatabase(fastaFilePath);
                var searchableDb   = new SearchableDatabase(fastaDb);

                using (var writer = new StreamWriter(outputFilePath))
                {
                    var isHeader    = true;
                    var nReadSeqTag = 0;

                    Console.WriteLine(@"Reading {0} file", tagFilePath);

                    var nColumn = 0;
                    foreach (var line in File.ReadAllLines(tagFilePath))
                    {
                        if (isHeader)
                        {
                            isHeader = false;
                            nColumn  = line.Split('\t').Length;
                            writer.WriteLine(line + "\t" + "Protein" + "\t" + "DetectedFlankingMass" + "\t" + "ExpectedFlankingMass" + "\t" + "DeltaMass");
                            continue;
                        }

                        var token = line.Split('\t');
                        if (token.Length != nColumn)
                        {
                            continue;
                        }
                        var tag = token[1];
                        //var scan = Convert.ToInt32(token[0]);

                        if (tag.Length < 6)
                        {
                            continue;
                        }

                        var nTerminal            = token[2].Equals("1");
                        var detectedFlankingMass = Double.Parse(token[3]);

                        if (!nTerminal)
                        {
                            detectedFlankingMass -= Composition.H2O.Mass;
                        }

                        nReadSeqTag++;

                        var matchedProteins =
                            searchableDb.FindAllMatchedSequenceIndices(tag)
                            .Select(index => fastaDb.GetProteinName(index))
                            .Distinct().ToArray();

                        if (matchedProteins.Length < 1)
                        {
                            continue;
                        }

                        foreach (var protName in matchedProteins)
                        {
                            var seqStr = fastaDb.GetProteinSequence(protName);
                            var oriSeq = new Sequence(seqStr, AminoAcidSet.GetStandardAminoAcidSet());

                            var startIdx = 0;
                            while (true)
                            {
                                var idx = seqStr.IndexOf(tag, startIdx);

                                if (idx < 0)
                                {
                                    break;          //no matching
                                }
                                //var nClv = (nTerminal) ? idx : seqStr.Length - idx - tag.Length;
                                var nClv = (nTerminal) ? 2 : 1;

                                for (var j = 0; j < nClv; j++)
                                {
                                    var flankComposition = (nTerminal)
                                        ? oriSeq.GetComposition(j, idx)
                                        : oriSeq.GetComposition(idx + tag.Length, oriSeq.Count - j);

                                    var massDiff = (detectedFlankingMass - flankComposition.Mass);
                                    if (massDiff > -500 && massDiff < 2000)
                                    {
                                        //writer.WriteLine(massDiff);
                                        writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", line, protName, detectedFlankingMass, flankComposition.Mass, massDiff);
                                    }

                                    if (massDiff > 2000)
                                    {
                                        break;
                                    }
                                }

                                startIdx = idx + tag.Length;
                            }
                        }
                        //var matchedProteinStr = string.Join(",", matchedProteins);
                        //var massDiffStr = string.Join(",", massDiffList);
                        //writer.WriteLine("{0}\t{1}\t{2}\t{3}", line, matchedProteins.Length, matchedProteinStr, massDiffStr);
                    }

                    Console.WriteLine(@"{0} seq tags are processed", nReadSeqTag);
                }
                Console.WriteLine(@"Done");
            }
        }
示例#3
0
        public void TestGetProteinsWithTagMatchingSingleSpec()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            TestUtils.ShowStarting(methodName);

            const string dataSet = @"H:\Research\Lewy\raw\Lewy_intact_07";
            //            const int scanNum = 5158;
            const int minTagLength     = 7;
            const int minNumTagMatches = 1;
            var       aminoAcidSet     = AminoAcidSet.GetStandardAminoAcidSet();

            const int scanNum = 2;
            // Parse sequence tags
            //const string tagFileName = dataSet + ".seqtag"; //"_MinLength3.seqtag"; //Path.ChangeExtension(dataSet, ".seqtag");

            const string rawFilePath = "";

            const string fastaFilePath = @"H:\Research\Lewy\ID_004858_0EE8CF61.fasta";

            if (!File.Exists(rawFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
            }

            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

            var fastaDb      = new FastaDatabase(fastaFilePath);
            var searchableDb = new SearchableDatabase(fastaDb);
            //var tagParser = new SequenceTagParser(tagFileName, minTagLength);
            //var tags = tagParser.GetSequenceTags(scanNum);
            var run       = PbfLcMsRun.GetLcMsRun(rawFilePath);
            var spec      = run.GetSpectrum(scanNum) as ProductSpectrum;
            var tagFinder = new SequenceTagFinder(spec, new Tolerance(5));
            var tags      = tagFinder.GetAllSequenceTagString();

            var proteinsToTags = new Dictionary <string, IList <MatchedTag> >();

            foreach (var tag in tags)
            {
                var matchedIndices = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence).ToArray();
                foreach (var index in matchedIndices)
                {
                    var protein    = fastaDb.GetProteinName(index);
                    var startIndex = fastaDb.GetOneBasedPositionInProtein(index);
                    var matchedTag = new MatchedTag(tag, startIndex, 0.0);
                    IList <MatchedTag> existingTags;
                    if (proteinsToTags.TryGetValue(protein, out existingTags))
                    {
                        existingTags.Add(matchedTag);
                    }
                    else
                    {
                        proteinsToTags.Add(protein, new List <MatchedTag> {
                            matchedTag
                        });
                    }
                }
            }

            foreach (var entry in proteinsToTags.OrderByDescending(e => e.Value.Count))
            {
                if (entry.Value.Count < minNumTagMatches)
                {
                    break;
                }
                var proteinName     = entry.Key;
                var proteinSequence = fastaDb.GetProteinSequence(proteinName);
                var protein         = new Sequence(proteinSequence, aminoAcidSet);
                Console.WriteLine(proteinName + "\t" + entry.Value.Count);
                foreach (var matchedTag in entry.Value)
                {
                    var seq = proteinSequence.Substring(matchedTag.StartIndex,
                                                        matchedTag.EndIndex - matchedTag.StartIndex);
                    var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
                    var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);
                    Console.WriteLine("\t{0} ({1})\t{2}\t{3} ({4})\t{5}\t{6}\t{7}",
                                      matchedTag.NTermFlankingMass, (matchedTag.NTermFlankingMass - nTermMass),
                                      seq,
                                      matchedTag.CTermFlankingMass, (matchedTag.CTermFlankingMass - cTermMass),
                                      matchedTag.StartIndex,
                                      matchedTag.IsNTermFlankingMassReliable, matchedTag.IsCTermFlankingMassReliable);
                }
            }
        }