/// <summary>
/// Walks from the residue immediately preceding the tag toward the protein N-terminus,
/// adding one residue to <paramref name="backwardGraph"/> per step and yielding the best
/// match found in the graph after each extension.
/// </summary>
/// <param name="matchedTag">Tag whose StartIndex marks where the backward walk begins.</param>
/// <param name="backwardGraph">Graph that is extended in place, one residue per step.</param>
/// <param name="featureMass">Optional feature mass forwarded to GetBestMatchInTheGraph.</param>
/// <returns>
/// Lazily produced matches; enumeration ends as soon as the graph rejects a residue.
/// </returns>
private IEnumerable<FlankingMassMatch> GetBackwardMatches(
    MatchedTag matchedTag,
    ShiftedSequenceGraph backwardGraph,
    double? featureMass = null)
{
    for (var position = matchedTag.StartIndex - 1; position >= -1; position--)
    {
        // position == -1 is the virtual slot in front of the first residue (protein N-term).
        var pastFirstResidue = position < 0;

        var residue = pastFirstResidue
            ? AminoAcid.ProteinNTerm.Residue
            : _proteinSequence[position];
        var location = position <= 0
            ? SequenceLocation.ProteinNTerm
            : SequenceLocation.Everywhere;

        var accepted = backwardGraph.AddAminoAcid(residue, location);
        if (!accepted)
        {
            yield break;
        }

        // Nothing is evaluated right after the first residue is added; the next
        // iteration adds the N-term residue and reports the match with Index 0.
        if (position != 0)
        {
            var candidate = GetBestMatchInTheGraph(backwardGraph, _spec, featureMass);
            if (candidate == null)
            {
                continue;
            }

            candidate.Index = pastFirstResidue ? 0 : position;
            yield return candidate;
        }
    }
}
// private readonly int _minProductIonCharge;
// private readonly int _maxProductIonCharge;

/// <summary>
/// Walks from the first residue after the tag toward the protein C-terminus, adding one
/// residue to <paramref name="forwardGraph"/> per step and yielding the best match found
/// in the graph after each extension.
/// </summary>
/// <param name="matchedTag">Tag whose EndIndex marks where the forward walk begins.</param>
/// <param name="forwardGraph">Graph that is extended in place, one residue per step.</param>
/// <param name="featureMass">Optional feature mass forwarded to GetBestMatchInTheGraph.</param>
/// <returns>
/// Lazily produced matches; enumeration ends as soon as the graph rejects a residue.
/// </returns>
private IEnumerable<FlankingMassMatch> GetForwardMatches(
    MatchedTag matchedTag,
    ShiftedSequenceGraph forwardGraph,
    double? featureMass = null)
{
    for (var position = matchedTag.EndIndex; position <= _proteinSequence.Length; position++)
    {
        var sequenceLength = _proteinSequence.Length;
        var lastResidueIndex = sequenceLength - 1;

        // position == sequenceLength is the virtual slot after the last residue (protein C-term).
        var residue = position >= sequenceLength
            ? AminoAcid.ProteinCTerm.Residue
            : _proteinSequence[position];
        var location = position >= lastResidueIndex
            ? SequenceLocation.ProteinCTerm
            : SequenceLocation.Everywhere;

        var accepted = forwardGraph.AddAminoAcid(residue, location);
        if (!accepted)
        {
            yield break;
        }

        // Nothing is evaluated right after the last residue is added; the next
        // iteration adds the C-term residue and reports the match at the sequence end.
        if (position != lastResidueIndex)
        {
            var candidate = GetBestMatchInTheGraph(forwardGraph, _spec, featureMass);
            if (candidate == null)
            {
                continue;
            }

            candidate.Index = Math.Min(position + 1, sequenceLength);
            yield return candidate;
        }
    }
}
/// <summary>
/// Enumerates tag matches for a tag whose N-terminal and C-terminal flanking masses are
/// both known. The feature mass is the sum of both flanking masses, the tag mass and the
/// mass of water; every backward match is then combined with every forward match.
/// </summary>
/// <param name="matchedTag">Tag to extend in both directions.</param>
/// <returns>
/// Lazily produced matches; empty when either flanking mass is missing. Combinations whose
/// total mass exceeds _maxSequenceMass are skipped.
/// </returns>
private IEnumerable<TagMatch> FindMatchesWithFeatureMass(MatchedTag matchedTag)
{
    if (matchedTag.NTermFlankingMass == null || matchedTag.CTermFlankingMass == null)
    {
        yield break;
    }

    var nTermFlankingMass = (double)matchedTag.NTermFlankingMass;
    var cTermFlankingMass = (double)matchedTag.CTermFlankingMass;

    var featureMass = nTermFlankingMass + matchedTag.Mass + cTermFlankingMass + Composition.H2O.Mass;
    var shiftMass = matchedTag.Mass + nTermFlankingMass;

    var backwardGraph = new ShiftedSequenceGraph(
        _aaSet,
        shiftMass,
        false,
        matchedTag.StartIndex,
        featureMass - MinSumModificationMasses);

    foreach (var nTermSide in GetBackwardMatches(matchedTag, backwardGraph, featureMass))
    {
        // A fresh forward graph is built for every backward match because its
        // starting shift depends on the mass of that backward match.
        var nTermShiftMass = nTermSide.Mass + matchedTag.Mass;
        var forwardGraph = new ShiftedSequenceGraph(
            _aaSet,
            nTermShiftMass,
            true,
            _proteinSequence.Length - matchedTag.EndIndex,
            featureMass - MinSumModificationMasses);

        foreach (var cTermSide in GetForwardMatches(matchedTag, forwardGraph, featureMass))
        {
            var totalMass = cTermSide.Mass + matchedTag.Mass + nTermSide.Mass;
            if (totalMass <= _maxSequenceMass)
            {
                // Forward-side modification instances are re-based by this offset
                // before being rendered into the modification string.
                var offset = matchedTag.EndIndex - nTermSide.Index - 1;
                var shiftedForwardMods = cTermSide.Modifications
                    .Select(m => m.GetModificationInstanceWithOffset(offset));
                var modStr = string.Join(",", nTermSide.Modifications.Concat(shiftedForwardMods));

                var modList = new List<Modification>();
                modList.AddRange(nTermSide.Modifications.Select(m => m.Modification));
                modList.AddRange(cTermSide.Modifications.Select(m => m.Modification));

                yield return new TagMatch(
                    nTermSide.Index,
                    cTermSide.Index,
                    matchedTag.Length,
                    nTermSide.Charge,
                    nTermSide.Score,
                    cTermSide.Score,
                    totalMass,
                    new ModificationCombination(modList),
                    modStr);
            }
        }
    }
}
/// <summary>
/// Groups sequence tags by the proteins in which their sequences occur.
/// </summary>
/// <param name="tags">Sequence tags to look up in the database.</param>
/// <param name="searchableDb">Database used to find every occurrence of a tag sequence.</param>
/// <param name="aaSet">Amino acid set used to compute tag masses and build tag sets.</param>
/// <param name="tolerance">Tolerance passed to each created MatchedTagSet.</param>
/// <param name="relaxedTolerance">Relaxed tolerance passed to each created MatchedTagSet.</param>
/// <returns>
/// Map from protein name to the set of tags matched in that protein. Tags that occur at
/// more than MaxNumProteinMatchesPerTag positions are left out entirely.
/// </returns>
public static Dictionary<string, MatchedTagSet> GetProteinToMatchedTagsMap(
    IEnumerable<SequenceTag.SequenceTag> tags,
    SearchableDatabase searchableDb,
    AminoAcidSet aaSet,
    Tolerance tolerance,
    Tolerance relaxedTolerance)
{
    var fastaDb = searchableDb.FastaDatabase;
    var tagSetsByProtein = new Dictionary<string, MatchedTagSet>();

    foreach (var tag in tags)
    {
        var occurrences = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence).ToArray();

        // Only tags with an acceptable number of occurrences are mapped.
        if (occurrences.Length <= MaxNumProteinMatchesPerTag)
        {
            foreach (var occurrence in occurrences)
            {
                var proteinName = fastaDb.GetProteinName(occurrence);
                var startIndex = fastaDb.GetZeroBasedPositionInProtein(occurrence);
                var tagMass = aaSet.GetComposition(tag.Sequence).Mass;
                var matchedTag = new MatchedTag(tag, startIndex) { Mass = tagMass };

                MatchedTagSet knownTagSet;
                if (proteinsToTagsLookup(tagSetsByProtein, proteinName, out knownTagSet))
                {
                    knownTagSet.Add(matchedTag);
                    continue;
                }

                // First tag for this protein: build its tag set. When no sequence is
                // returned for the name, the name itself is used in its place.
                var proteinSequence = fastaDb.GetProteinSequence(proteinName) ?? proteinName;

                var newTagSet = new MatchedTagSet(proteinSequence, aaSet, tolerance, relaxedTolerance);
                newTagSet.Add(matchedTag);
                tagSetsByProtein.Add(proteinName, newTagSet);
            }
        }
    }

    return tagSetsByProtein;
}

/// <summary>
/// Thin wrapper over Dictionary.TryGetValue for the protein-to-tag-set map.
/// </summary>
private static bool proteinsToTagsLookup(
    Dictionary<string, MatchedTagSet> map,
    string proteinName,
    out MatchedTagSet tagSet)
{
    return map.TryGetValue(proteinName, out tagSet);
}
/// <summary>
/// Chooses a matching strategy from which flanking masses of the tag are known.
/// </summary>
/// <param name="matchedTag">Tag to find matches for.</param>
/// <returns>
/// Matches from FindMatchesWithFeatureMass when both flanking masses are set, from
/// FindMatchesForwardAndBackward when only the N-terminal one is set, from
/// FindMatchesBackwardAndForward when only the C-terminal one is set, and an empty
/// sequence when neither is set.
/// </returns>
public IEnumerable<TagMatch> FindMatches(MatchedTag matchedTag)
{
    var hasNTermMass = matchedTag.NTermFlankingMass != null;
    var hasCTermMass = matchedTag.CTermFlankingMass != null;

    if (hasNTermMass)
    {
        if (hasCTermMass)
        {
            return FindMatchesWithFeatureMass(matchedTag);
        }

        return FindMatchesForwardAndBackward(matchedTag);
    }

    if (hasCTermMass)
    {
        return FindMatchesBackwardAndForward(matchedTag);
    }

    return Enumerable.Empty<TagMatch>();
}
/// <summary>
/// Exploratory test: for one hard-coded feature (mass and scan range), collects the
/// sequence tags of its MS/MS scans, maps them onto database proteins, and prints the
/// tags of the protein with the most tag matches before and after merging them in a
/// MatchedTagSet. The test makes no assertions; it is skipped when its input files
/// are not available.
/// </summary>
public void TestFeatureId()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    // Extension-less base path; the files actually read are derived from it below.
    const string dataSet = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3";
    const string tagFileName = dataSet + ".seqtag"; //"_MinLength3.seqtag"; //Path.ChangeExtension(dataSet, ".seqtag");
    const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";

    // Guard on the raw file that is really opened. Checking the base path itself
    // (which carries no extension and is never opened) says nothing about whether
    // the input is present.
    var rawFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".raw");
    if (!File.Exists(rawFileName))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFileName);
    }

    // Checked up front so a missing database skips the test before the run is loaded.
    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    // Feature: 5236-5286 6-12 8480.3681 5
    const int minScanNum = 5236;
    const int maxScanNum = 5286;
    const double featureMass = 8480.3681;

    //const int minScanNum = 7251;
    //const int maxScanNum = 7326;
    //const double featureMass = 32347.18;

    // const int minScanNum = 4451;
    // const int maxScanNum = 4541;
    // const double featureMass = 31267.95;

    var tolerance = new Tolerance(10);
    var relaxedTolerance = new Tolerance(20);

    const int minTagLength = 5;
    const int minMergedTagLength = 7;
    const int minNumTagMatches = 1;

    var run = PbfLcMsRun.GetLcMsRun(rawFileName);
    var aminoAcidSet = AminoAcidSet.GetStandardAminoAcidSet();

    var featureFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".ms1ft");
    var filter = new Ms1FtFilter(run, tolerance, featureFileName);

    // Scan range bounds are exclusive on both ends.
    var ms2ScanNums = filter.GetMatchingMs2ScanNums(featureMass)
        .Where(scanNum => scanNum > minScanNum && scanNum < maxScanNum)
        .ToArray();

    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);

    var tagParser = new SequenceTagParser(tagFileName, minTagLength);

    // Collect, per protein, every tag occurrence found in the selected scans.
    var proteinsToTags = new Dictionary<string, IList<MatchedTag>>();
    foreach (var ms2ScanNum in ms2ScanNums)
    {
        var tags = tagParser.GetSequenceTags(ms2ScanNum);
        foreach (var tag in tags)
        {
            var matchedIndices = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence).ToArray();
            foreach (var index in matchedIndices)
            {
                var matchedProteinName = fastaDb.GetProteinName(index);
                var startIndex = fastaDb.GetZeroBasedPositionInProtein(index);
                var matchedTag = new MatchedTag(tag, startIndex, featureMass);

                IList<MatchedTag> existingTags;
                if (proteinsToTags.TryGetValue(matchedProteinName, out existingTags))
                {
                    existingTags.Add(matchedTag);
                }
                else
                {
                    proteinsToTags.Add(matchedProteinName, new List<MatchedTag> { matchedTag });
                }
            }
        }
    }

    foreach (var entry in proteinsToTags.OrderByDescending(e => e.Value.Count))
    {
        if (entry.Value.Count < minNumTagMatches)
        {
            break;
        }

        var proteinName = entry.Key;
        var proteinSequence = fastaDb.GetProteinSequence(proteinName);
        var protein = new Sequence(proteinSequence, aminoAcidSet);
        Console.WriteLine(proteinName + "\t" + entry.Value.Count);

        var matchedTagSet = new MatchedTagSet(proteinSequence, aminoAcidSet, tolerance, relaxedTolerance);

        Console.WriteLine("********** Before merging");
        foreach (var matchedTag in entry.Value)
        {
            var seq = proteinSequence.Substring(matchedTag.StartIndex, matchedTag.EndIndex - matchedTag.StartIndex);
            var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
            var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);

            // Columns: N-term mass difference, tag sequence, C-term mass difference,
            // start index, N-term reliability flag, C-term reliability flag.
            Console.WriteLine("\t{0}\t{1}\t{2}\t{3}\t{4}\t{5}",
                (matchedTag.NTermFlankingMass - nTermMass),
                seq,
                (matchedTag.CTermFlankingMass - cTermMass),
                matchedTag.StartIndex,
                matchedTag.IsNTermFlankingMassReliable,
                matchedTag.IsCTermFlankingMassReliable);

            matchedTagSet.Add(matchedTag);
        }

        Console.WriteLine("********** After merging");
        foreach (var matchedTag in matchedTagSet.Tags)
        {
            if (matchedTag.Length < minMergedTagLength)
            {
                continue;
            }

            var seq = proteinSequence.Substring(matchedTag.StartIndex, matchedTag.EndIndex - matchedTag.StartIndex);
            var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
            var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);

            Console.WriteLine("\t{0}\t{1}\t{2}\t{3}\t{4}\t{5}",
                (matchedTag.NTermFlankingMass - nTermMass),
                seq,
                (matchedTag.CTermFlankingMass - cTermMass),
                matchedTag.StartIndex,
                matchedTag.IsNTermFlankingMassReliable,
                matchedTag.IsCTermFlankingMassReliable);
        }

        // Only the protein with the most tag matches is reported.
        break;
    }
}
/// <summary>
/// Exploratory test: finds sequence tags in a single spectrum, maps them onto database
/// proteins, and prints every matched tag per protein (ordered by descending number of
/// tag matches) together with its flanking masses. The test makes no assertions; it is
/// skipped when its input files are not available.
/// </summary>
public void TestGetProteinsWithTagMatchingSingleSpec()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dataSet = @"H:\Research\Lewy\raw\Lewy_intact_07";
    // const int scanNum = 5158;
    const int minTagLength = 7;
    const int minNumTagMatches = 1;
    var aminoAcidSet = AminoAcidSet.GetStandardAminoAcidSet();

    const int scanNum = 2;

    // Parse sequence tags
    //const string tagFileName = dataSet + ".seqtag"; //"_MinLength3.seqtag"; //Path.ChangeExtension(dataSet, ".seqtag");

    const string rawFilePath = "";
    const string fastaFilePath = @"H:\Research\Lewy\ID_004858_0EE8CF61.fasta";

    if (!File.Exists(rawFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
    }

    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);

    //var tagParser = new SequenceTagParser(tagFileName, minTagLength);
    //var tags = tagParser.GetSequenceTags(scanNum);

    var run = PbfLcMsRun.GetLcMsRun(rawFilePath);
    var spec = run.GetSpectrum(scanNum) as ProductSpectrum;
    if (spec == null)
    {
        // The "as" cast yields null when the scan is not a ProductSpectrum; skip with a
        // clear message instead of passing null on to the tag finder.
        Assert.Ignore(@"Skipping test {0} since scan {1} is not a product spectrum", methodName, scanNum);
    }

    var tagFinder = new SequenceTagFinder(spec, new Tolerance(5));
    var tags = tagFinder.GetAllSequenceTagString();

    // Collect, per protein, every tag occurrence found in the spectrum.
    var proteinsToTags = new Dictionary<string, IList<MatchedTag>>();
    foreach (var tag in tags)
    {
        var matchedIndices = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence).ToArray();
        foreach (var index in matchedIndices)
        {
            var matchedProteinName = fastaDb.GetProteinName(index);

            // The start index is used below as a zero-based offset (Substring, GetMass),
            // so the zero-based position is required; a one-based position would shift
            // every reported tag by one residue.
            var startIndex = fastaDb.GetZeroBasedPositionInProtein(index);
            var matchedTag = new MatchedTag(tag, startIndex, 0.0);

            IList<MatchedTag> existingTags;
            if (proteinsToTags.TryGetValue(matchedProteinName, out existingTags))
            {
                existingTags.Add(matchedTag);
            }
            else
            {
                proteinsToTags.Add(matchedProteinName, new List<MatchedTag> { matchedTag });
            }
        }
    }

    foreach (var entry in proteinsToTags.OrderByDescending(e => e.Value.Count))
    {
        // Entries are sorted by descending count, so the first one below the
        // threshold ends the report.
        if (entry.Value.Count < minNumTagMatches)
        {
            break;
        }

        var proteinName = entry.Key;
        var proteinSequence = fastaDb.GetProteinSequence(proteinName);
        var protein = new Sequence(proteinSequence, aminoAcidSet);
        Console.WriteLine(proteinName + "\t" + entry.Value.Count);

        foreach (var matchedTag in entry.Value)
        {
            var seq = proteinSequence.Substring(matchedTag.StartIndex, matchedTag.EndIndex - matchedTag.StartIndex);
            var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
            var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);

            // Columns: N-term flanking mass (difference), tag sequence, C-term flanking
            // mass (difference), start index, N-term and C-term reliability flags.
            Console.WriteLine("\t{0} ({1})\t{2}\t{3} ({4})\t{5}\t{6}\t{7}",
                matchedTag.NTermFlankingMass,
                (matchedTag.NTermFlankingMass - nTermMass),
                seq,
                matchedTag.CTermFlankingMass,
                (matchedTag.CTermFlankingMass - cTermMass),
                matchedTag.StartIndex,
                matchedTag.IsNTermFlankingMassReliable,
                matchedTag.IsCTermFlankingMassReliable);
        }
    }
}