Пример #1
0
        /// <summary>
        /// Diagnostic test for a single LC-MS feature: gathers the sequence tags of the feature's
        /// MS2 scans, maps them onto proteins of a FASTA database, and prints the tags of the
        /// protein with the most tag matches before and after merging them in a MatchedTagSet.
        /// </summary>
        /// <remarks>
        /// The test is ignored when any of its input files is missing. It only writes to the
        /// console and performs no assertions.
        /// </remarks>
        public void TestFeatureId()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            // Extension-less base path of the dataset; every file read below is derived from it.
            const string dataSet = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3";
            const string tagFileName = dataSet + ".seqtag"; // alternative: dataSet + "_MinLength3.seqtag"
            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";

            var rawFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".raw");
            var featureFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".ms1ft");

            // Verify the files that are actually opened (not the extension-less base path, which
            // never exists as a file) and do so before any expensive loading starts.
            // NOTE(review): if GetLcMsRun can work from a cached file when the .raw file is absent,
            // the .raw check is stricter than necessary - confirm.
            foreach (var requiredFile in new[] { rawFileName, featureFileName, tagFileName, fastaFilePath })
            {
                if (!File.Exists(requiredFile))
                {
                    Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, requiredFile);
                }
            }

            // Feature under test: scans 5236-5286, charge 6-12, mass 8480.3681
            // Alternative features that were used with this test:
            //   scans 7251-7326, mass 32347.18
            //   scans 4451-4541, mass 31267.95
            const int minScanNum = 5236;
            const int maxScanNum = 5286;
            const double featureMass = 8480.3681;

            var tolerance = new Tolerance(10);
            var relaxedTolerance = new Tolerance(20);

            const int minTagLength = 5;
            const int minMergedTagLength = 7;
            const int minNumTagMatches = 1;

            var run = PbfLcMsRun.GetLcMsRun(rawFileName);

            var aminoAcidSet = AminoAcidSet.GetStandardAminoAcidSet();
            var filter = new Ms1FtFilter(run, tolerance, featureFileName);

            // Both scan-number bounds are exclusive.
            var ms2ScanNums =
                filter.GetMatchingMs2ScanNums(featureMass)
                    .Where(scanNum => scanNum > minScanNum && scanNum < maxScanNum)
                    .ToArray();

            var fastaDb = new FastaDatabase(fastaFilePath);
            var searchableDb = new SearchableDatabase(fastaDb);
            var tagParser = new SequenceTagParser(tagFileName, minTagLength);

            // Protein name -> every tag occurrence found in that protein.
            var proteinsToTags = new Dictionary<string, IList<MatchedTag>>();
            foreach (var ms2ScanNum in ms2ScanNums)
            {
                var tags = tagParser.GetSequenceTags(ms2ScanNum);
                foreach (var tag in tags)
                {
                    var matchedIndices = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence).ToArray();
                    foreach (var index in matchedIndices)
                    {
                        var protein = fastaDb.GetProteinName(index);
                        var startIndex = fastaDb.GetZeroBasedPositionInProtein(index);
                        var matchedTag = new MatchedTag(tag, startIndex, featureMass);

                        IList<MatchedTag> existingTags;
                        if (!proteinsToTags.TryGetValue(protein, out existingTags))
                        {
                            existingTags = new List<MatchedTag>();
                            proteinsToTags.Add(protein, existingTags);
                        }

                        existingTags.Add(matchedTag);
                    }
                }
            }

            // Proteins are visited from most to fewest tag matches; the trailing break means only
            // the top-ranked protein is ever reported.
            foreach (var entry in proteinsToTags.OrderByDescending(e => e.Value.Count))
            {
                if (entry.Value.Count < minNumTagMatches)
                {
                    break;
                }

                var proteinName = entry.Key;
                var proteinSequence = fastaDb.GetProteinSequence(proteinName);
                var protein = new Sequence(proteinSequence, aminoAcidSet);
                Console.WriteLine(proteinName + "\t" + entry.Value.Count);

                var matchedTagSet = new MatchedTagSet(proteinSequence, aminoAcidSet,
                    tolerance, relaxedTolerance);

                // Columns: N-term flanking mass minus the protein prefix mass, matched sequence,
                // C-term flanking mass minus the protein suffix mass, start index, and the two
                // flanking-mass reliability flags.
                Console.WriteLine("********** Before merging");
                foreach (var matchedTag in entry.Value)
                {
                    var seq = proteinSequence.Substring(matchedTag.StartIndex,
                        matchedTag.EndIndex - matchedTag.StartIndex);
                    var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
                    var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);
                    Console.WriteLine("\t{0}\t{1}\t{2}\t{3}\t{4}\t{5}",
                        (matchedTag.NTermFlankingMass - nTermMass), seq, (matchedTag.CTermFlankingMass - cTermMass), matchedTag.StartIndex,
                        matchedTag.IsNTermFlankingMassReliable, matchedTag.IsCTermFlankingMassReliable);

                    matchedTagSet.Add(matchedTag);
                }

                Console.WriteLine("********** After merging");
                foreach (var matchedTag in matchedTagSet.Tags)
                {
                    // Merged tags shorter than the minimum length are not reported.
                    if (matchedTag.Length < minMergedTagLength)
                    {
                        continue;
                    }

                    var seq = proteinSequence.Substring(matchedTag.StartIndex,
                        matchedTag.EndIndex - matchedTag.StartIndex);
                    var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
                    var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);
                    Console.WriteLine("\t{0}\t{1}\t{2}\t{3}\t{4}\t{5}",
                        (matchedTag.NTermFlankingMass - nTermMass), seq, (matchedTag.CTermFlankingMass - cTermMass), matchedTag.StartIndex,
                        matchedTag.IsNTermFlankingMassReliable, matchedTag.IsCTermFlankingMassReliable);
                }

                break;
            }
        }
Пример #2
0
        /// <summary>
        /// Looks up every sequence tag in the searchable database and groups the resulting
        /// matched tags by the name of the protein they were found in.
        /// </summary>
        /// <param name="tags">Sequence tags to look up.</param>
        /// <param name="searchableDb">
        /// Database searched for each tag's sequence; its FastaDatabase supplies the protein
        /// name, the zero-based position in the protein, and the protein sequence of each match.
        /// </param>
        /// <param name="aaSet">
        /// Amino acid set used to compute each tag's mass; also passed to every MatchedTagSet created.
        /// </param>
        /// <param name="tolerance">Tolerance passed to every MatchedTagSet created.</param>
        /// <param name="relaxedTolerance">Relaxed tolerance passed to every MatchedTagSet created.</param>
        /// <returns>
        /// A map from protein name to the MatchedTagSet holding the tags matched in that protein.
        /// Tags with more than MaxNumProteinMatchesPerTag matched indices contribute nothing.
        /// </returns>
        public static Dictionary<string, MatchedTagSet> GetProteinToMatchedTagsMap(
            IEnumerable<SequenceTag> tags, 
            SearchableDatabase searchableDb, 
            AminoAcidSet aaSet, 
            Tolerance tolerance,
            Tolerance relaxedTolerance)
        {
            var fastaDb = searchableDb.FastaDatabase;
            var proteinsToTags = new Dictionary<string, MatchedTagSet>();
            foreach (var tag in tags)
            {
                var matchedIndices = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence).ToArray();

                // Skip tags without matches (so the mass below is never computed for them) and
                // tags that match too many places.
                if (matchedIndices.Length == 0 || matchedIndices.Length > MaxNumProteinMatchesPerTag) continue;

                // The mass depends only on the tag sequence, so compute it once per tag instead
                // of once per matched index.
                var mass = aaSet.GetComposition(tag.Sequence).Mass;

                foreach (var index in matchedIndices)
                {
                    var proteinName = fastaDb.GetProteinName(index);
                    var startIndex = fastaDb.GetZeroBasedPositionInProtein(index);
                    var matchedTag = new MatchedTag(tag, startIndex) { Mass = mass };

                    // Get the protein's tag set, creating it on the first match in that protein.
                    MatchedTagSet matchedTagSet;
                    if (!proteinsToTags.TryGetValue(proteinName, out matchedTagSet))
                    {
                        var proteinSequence = fastaDb.GetProteinSequence(proteinName);
                        matchedTagSet = new MatchedTagSet(proteinSequence, aaSet, tolerance, relaxedTolerance);
                        proteinsToTags.Add(proteinName, matchedTagSet);
                    }

                    matchedTagSet.Add(matchedTag);
                }
            }

            return proteinsToTags;
        }