public void TestSearching() { var methodName = MethodBase.GetCurrentMethod().Name; TestUtils.ShowStarting(methodName); var sw = new System.Diagnostics.Stopwatch(); sw.Start(); const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\Short.fasta"; if (!File.Exists(dbFile)) { Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dbFile); } var db = new FastaDatabase(dbFile); var searchableDb = new SearchableDatabase(db); //const string pattern = "NSGSHFCGGSLINSQWVVSAAH"; const string pattern = "FPTDDDDK"; var position = searchableDb.Search(pattern); Assert.True(position >= 0); Console.WriteLine("Position: {0}", position); Console.WriteLine("Matched indices: {0}", string.Join(",", searchableDb.FindAllMatchedSequenceIndices(pattern))); Console.WriteLine("Protein indices: {0}", string.Join(",", searchableDb.FindAllMatchedSequenceIndices(pattern).Select(i => db.GetOneBasedPositionInProtein(i)))); sw.Stop(); Console.WriteLine(@"{0:f4} sec", sw.Elapsed.TotalSeconds); }
public void FindProteinDeltaMass() { var methodName = MethodBase.GetCurrentMethod().Name; TestUtils.ShowStarting(methodName); const string folderPath = @"D:\MassSpecFiles\Glyco\"; if (!Directory.Exists(folderPath)) { Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, folderPath); } var fileSet = new string[] { "User_sample_test_02252015", "User_sample_test_MWCO_02262016", "User_sample_test_SEC_F3_03022105", "User_sample_test_SEC_F1_02272015", "User_sample_test_SEC_F2_02282015" }; const string fastaFilePath = folderPath + "ID_003836_DA9CC1E4.fasta"; for (var i = 0; i < fileSet.Length; i++) { var datasetName = fileSet[i]; var tagFilePath = folderPath + datasetName + ".seqtag"; //var outputFilePath = folderPath + datasetName + ".matchedtag"; var outputFilePath = folderPath + datasetName + ".dmass"; var fastaDb = new FastaDatabase(fastaFilePath); var searchableDb = new SearchableDatabase(fastaDb); using (var writer = new StreamWriter(outputFilePath)) { var isHeader = true; var nReadSeqTag = 0; Console.WriteLine(@"Reading {0} file", tagFilePath); var nColumn = 0; foreach (var line in File.ReadAllLines(tagFilePath)) { if (isHeader) { isHeader = false; nColumn = line.Split('\t').Length; writer.WriteLine(line + "\t" + "Protein" + "\t" + "DetectedFlankingMass" + "\t" + "ExpectedFlankingMass" + "\t" + "DeltaMass"); continue; } var token = line.Split('\t'); if (token.Length != nColumn) continue; var tag = token[1]; //var scan = Convert.ToInt32(token[0]); if (tag.Length < 6) continue; var nTerminal = token[2].Equals("1"); var detectedFlankingMass = Double.Parse(token[3]); if (!nTerminal) detectedFlankingMass -= Composition.H2O.Mass; nReadSeqTag++; var matchedProteins = searchableDb.FindAllMatchedSequenceIndices(tag) .Select(index => fastaDb.GetProteinName(index)) .Distinct().ToArray(); if (matchedProteins.Length < 1) continue; foreach (var protName in matchedProteins) { var seqStr = fastaDb.GetProteinSequence(protName); var oriSeq = new Sequence(seqStr, AminoAcidSet.GetStandardAminoAcidSet()); var startIdx = 0; while (true) { var idx = seqStr.IndexOf(tag, startIdx); if (idx < 0) break; //no matching //var nClv = (nTerminal) ? idx : seqStr.Length - idx - tag.Length; var nClv = (nTerminal) ? 2 : 1; for (var j = 0; j < nClv; j++) { var flankComposition = (nTerminal) ? oriSeq.GetComposition(j, idx) : oriSeq.GetComposition(idx + tag.Length, oriSeq.Count - j); var massDiff = (detectedFlankingMass - flankComposition.Mass); if (massDiff > -500 && massDiff < 2000) { //writer.WriteLine(massDiff); writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", line, protName, detectedFlankingMass, flankComposition.Mass, massDiff); } if (massDiff > 2000) break; } startIdx = idx + tag.Length; } } //var matchedProteinStr = string.Join(",", matchedProteins); //var massDiffStr = string.Join(",", massDiffList); //writer.WriteLine("{0}\t{1}\t{2}\t{3}", line, matchedProteins.Length, matchedProteinStr, massDiffStr); } Console.WriteLine(@"{0} seq tags are processed", nReadSeqTag); } Console.WriteLine(@"Done"); } }
public void CountMatchedScansPerProtein() { var methodName = MethodBase.GetCurrentMethod().Name; TestUtils.ShowStarting(methodName); const int minTagLength = 6; var proteinToScan = new Dictionary<string, HashSet<int>>(); const string fastaFilePath = @"D:\Research\Data\CommonContaminants\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta"; if (!File.Exists(fastaFilePath)) { Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath); } var fastaDb = new FastaDatabase(fastaFilePath); var searchableDb = new SearchableDatabase(fastaDb); Console.WriteLine(@"Sequence length: {0}", fastaDb.GetSequence().Length); //const string tagFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3_seqtag.tsv"; //const string tagFilePath = @"\\protoapps\UserData\Jungkap\Co_culture\23B_pellet_TD_3Feb14_Bane_PL011402.seqtag"; const string tagFilePath = @"D:\MassSpecFiles\co_culture\23A_pellet_TD_3Feb14_Bane_PL011402.seqtag"; if (!File.Exists(tagFilePath)) { Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath); } var isHeader = true; var numMatchedPairs = 0; foreach (var line in File.ReadAllLines(tagFilePath)) { if (isHeader) { isHeader = false; continue; } var token = line.Split('\t'); if (token.Length != 3) continue; var scan = Convert.ToInt32(token[0]); var tag = token[1]; if (tag.Length < minTagLength) continue; foreach (var matchedProtein in searchableDb.FindAllMatchedSequenceIndices(tag) .Select(index => fastaDb.GetProteinName(index))) { ++numMatchedPairs; HashSet<int> matchedScans; if (proteinToScan.TryGetValue(matchedProtein, out matchedScans)) { matchedScans.Add(scan); } else { matchedScans = new HashSet<int> {scan}; proteinToScan.Add(matchedProtein, matchedScans); } } } var numMatchedProteins = proteinToScan.Keys.Count; var numAllProteins = fastaDb.GetNumEntries(); Console.WriteLine("NumAllProteins: {0}", numAllProteins); Console.WriteLine("NumMatchedProteins: {0}", numMatchedProteins); Console.WriteLine("AvgMatchedScansPerProtein: {0}", numMatchedPairs / (float)numAllProteins); }
public void CountMatchedProteins() { var methodName = MethodBase.GetCurrentMethod().Name; TestUtils.ShowStarting(methodName); const int minTagLength = 3; var scanToProtein = new Dictionary<int, string>(); var idTag = new Dictionary<int, bool>(); const string resultFilePath = @"H:\Research\ProMex\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv"; if (!File.Exists(resultFilePath)) { Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath); } var parser = new TsvFileParser(resultFilePath); var scans = parser.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray(); var proteinNames = parser.GetData("ProteinName").ToArray(); var qValues = parser.GetData("QValue").Select(Convert.ToDouble).ToArray(); for (var i = 0; i < qValues.Length; i++) { if (qValues[i] > 0.01) break; scanToProtein.Add(scans[i], proteinNames[i]); idTag.Add(scans[i], false); } const string rawFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.raw"; if (!File.Exists(rawFilePath)) { Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath); } var run = PbfLcMsRun.GetLcMsRun(rawFilePath); const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta"; if (!File.Exists(fastaFilePath)) { Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath); } // const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.icsfldecoy.fasta"; // const string fastaFilePath = // @"D:\Research\Data\CommonContaminants\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta"; var fastaDb = new FastaDatabase(fastaFilePath); var searchableDb = new SearchableDatabase(fastaDb); Console.WriteLine("Sequence length: {0}", fastaDb.GetSequence().Length); const string tagFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.seqtag"; if (!File.Exists(tagFilePath)) { Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath); } var hist = new Dictionary<int, int>(); var scanSet = new HashSet<int>(); HashSet<string> proteinSetForThisScan = null; var prevScan = -1; var totalNumMatches = 0L; var isHeader = true; foreach (var line in File.ReadAllLines(tagFilePath)) { if (isHeader) { isHeader = false; continue; } var token = line.Split('\t'); if (token.Length < 3) continue; var scan = Convert.ToInt32(token[0]); var proteinId = scanToProtein.ContainsKey(scan) ? scanToProtein[scan] : null; if (scan != prevScan) { if (proteinSetForThisScan != null) { var numMatches = proteinSetForThisScan.Count; int numOcc; if (hist.TryGetValue(numMatches, out numOcc)) hist[numMatches] = numOcc + 1; else hist.Add(numMatches, 1); } prevScan = scan; proteinSetForThisScan = new HashSet<string>(); } scanSet.Add(scan); var tag = token[1]; if (tag.Length < minTagLength) continue; if (proteinSetForThisScan == null) continue; var numMatchesForThisTag = 0; foreach (var matchedProtein in searchableDb.FindAllMatchedSequenceIndices(tag) .Select(index => fastaDb.GetProteinName(index))) { proteinSetForThisScan.Add(matchedProtein); ++numMatchesForThisTag; if (proteinId != null && matchedProtein.Equals(proteinId)) { idTag[scan] = true; } } totalNumMatches += numMatchesForThisTag; // if (numMatchesForThisTag > 10) // { // Console.WriteLine("{0}\t{1}", tag, numMatchesForThisTag); // } } if (proteinSetForThisScan != null) { var numMatches = proteinSetForThisScan.Count; int numOcc; if (hist.TryGetValue(numMatches, out numOcc)) hist[numMatches] = numOcc + 1; else hist.Add(numMatches, 1); } Console.WriteLine("AvgNumMatches: {0}", totalNumMatches/(float)scanSet.Count); Console.WriteLine("Histogram:"); foreach (var entry in hist.OrderBy(e => e.Key)) { Console.WriteLine("{0}\t{1}", entry.Key, entry.Value); } Console.WriteLine("NumId: {0}", idTag.Count); Console.WriteLine("NumIdByTag: {0}", idTag.Select(e => e.Value).Count(v => v)); }
public void FindProteins() { var methodName = MethodBase.GetCurrentMethod().Name; TestUtils.ShowStarting(methodName); const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta"; if (!File.Exists(fastaFilePath)) { Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath); } var fastaDb = new FastaDatabase(fastaFilePath); var searchableDb = new SearchableDatabase(fastaDb); const string tagFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3_seqtag.tsv"; if (!File.Exists(tagFilePath)) { Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath); } const string outputFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3_matchedtag.tsv"; using (var writer = new StreamWriter(outputFilePath)) { var isHeader = true; foreach (var line in File.ReadAllLines(tagFilePath)) { if (isHeader) { isHeader = false; writer.WriteLine(line+"\t"+"Proteins"); continue; } var token = line.Split('\t'); if (token.Length != 3) continue; var tag = token[1]; var matchedProteins = searchableDb.FindAllMatchedSequenceIndices(tag) .Select(index => fastaDb.GetProteinName(index)) .Distinct().ToArray(); var matchedProteinStr = string.Join(",", matchedProteins); writer.WriteLine("{0}\t{1}\t{2}", line, matchedProteins.Length, matchedProteinStr); } } Console.WriteLine(@"Done"); }
public void TestFeatureIdMatching() { var methodName = MethodBase.GetCurrentMethod().Name; TestUtils.ShowStarting(methodName); const string resultFilePath = @"H:\Research\QCShew_TopDown\Production\M1_V092\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv"; if (!File.Exists(resultFilePath)) { Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath); } var resultParser = new MsPathFinderParser(resultFilePath); const double qValueThreshold = 0.01; const double tolerancePpm = 13; const string dataSet = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3"; var rawFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".raw"); if (!File.Exists(rawFileName)) { Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFileName); } var run = PbfLcMsRun.GetLcMsRun(rawFileName); var idList = resultParser.GetIdList().TakeWhile(id => id.QValue <= qValueThreshold).OrderBy(id => id.Mass).ToList(); var idMassList = idList.Select(id => id.Mass).ToList(); var idFlag = new bool[idList.Count]; // Parse sequence tags var tagFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".seqtag"); const int minTagLength = 6; const int numProtMatches = 4; // const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta"; const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.icsfldecoy.fasta"; if (!File.Exists(tagFileName)) { Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFileName); } if (!File.Exists(fastaFilePath)) { Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath); } var fastaDb = new FastaDatabase(fastaFilePath); var searchableDb = new SearchableDatabase(fastaDb); var tagParser = new SequenceTagParser(tagFileName, minTagLength); var featureFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".ms1ft"); var featureParser = new TsvFileParser(featureFileName); var minScan = featureParser.GetData("MinScan").Select(s => Convert.ToInt32(s)).ToArray(); var maxScan = featureParser.GetData("MaxScan").Select(s => Convert.ToInt32(s)).ToArray(); var minCharge = featureParser.GetData("MinCharge").Select(s => Convert.ToInt32(s)).ToArray(); var maxCharge = featureParser.GetData("MaxCharge").Select(s => Convert.ToInt32(s)).ToArray(); var monoMass = featureParser.GetData("MonoMass").Select(Convert.ToDouble).ToArray(); var numFeaturesWithId = 0; var numFeaturesWithMs2 = 0; var numFeaturesWithTags = 0; var numFeaturesWithMatchingTags = 0; var numFeaturesWithTwoOrMoreMatchingTags = 0; var numFeaturesWithNoIdAndMatchingTags = 0; for (var i = 0; i < featureParser.NumData; i++) { var mass = monoMass[i]; // Find Id var tolDa = new Tolerance(tolerancePpm).GetToleranceAsDa(mass, 1); var minMass = mass - tolDa; var maxMass = mass + tolDa; var index = idMassList.BinarySearch(mass); if (index < 0) index = ~index; var matchedId = new List<MsPathFinderId>(); // go down var curIndex = index - 1; while (curIndex >= 0) { var curId = idList[curIndex]; if (curId.Mass < minMass) break; if (curId.Scan > minScan[i] && curId.Scan < maxScan[i] && curId.Charge >= minCharge[i] && curId.Charge <= maxCharge[i]) { matchedId.Add(curId); idFlag[curIndex] = true; } --curIndex; } // go up curIndex = index; while (curIndex < idList.Count) { var curId = idList[curIndex]; if (curId.Mass > maxMass) break; if (curId.Scan >= minScan[i] && curId.Scan <= maxScan[i] && curId.Charge >= minCharge[i] && curId.Charge <= maxCharge[i]) { matchedId.Add(curId); idFlag[curIndex] = true; } ++curIndex; } var hasId = false; if (matchedId.Any()) { ++numFeaturesWithId; hasId = true; } // Find MS2 scans // var numMs2Scans = 0; var tags = new List<SequenceTag>(); var hasMs2 = false; for (var scanNum = minScan[i]; scanNum <= maxScan[i]; scanNum++) { var isolationWindow = run.GetIsolationWindow(scanNum); if (isolationWindow == null) continue; var isolationWindowTargetMz = isolationWindow.IsolationWindowTargetMz; var charge = (int)Math.Round(mass / isolationWindowTargetMz); if (charge < minCharge[i] || charge > maxCharge[i]) continue; var mz = Ion.GetIsotopeMz(mass, charge, Averagine.GetIsotopomerEnvelope(mass).MostAbundantIsotopeIndex); if (isolationWindow.Contains(mz)) { // ++numMs2Scans; tags.AddRange(tagParser.GetSequenceTags(scanNum)); hasMs2 = true; } } if (hasMs2) ++numFeaturesWithMs2; if (tags.Any()) ++numFeaturesWithTags; var protHist = new Dictionary<string, int>(); var hasMatchedTag = false; foreach (var tag in tags) { var matchedProteins = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence).Select(idx => fastaDb.GetProteinName(idx)).ToArray(); if (matchedProteins.Any()) { hasMatchedTag = true; foreach (var protein in matchedProteins) { int num; if (protHist.TryGetValue(protein, out num)) protHist[protein] = num + 1; else protHist[protein] = 1; } } } if (hasMatchedTag) { ++numFeaturesWithMatchingTags; if (!hasId) ++numFeaturesWithNoIdAndMatchingTags; } if (protHist.Any()) { var maxOcc = protHist.Values.Max(); if (maxOcc >= numProtMatches) ++numFeaturesWithTwoOrMoreMatchingTags; } } Console.WriteLine("NumFeatures: {0}", featureParser.NumData); Console.WriteLine("NumId: {0}", idList.Count); Console.WriteLine("NumFeaturesWithId: {0} ({1})", numFeaturesWithId, numFeaturesWithId / (float)featureParser.NumData); Console.WriteLine("NumFeaturesWithMs2: {0} ({1})", numFeaturesWithMs2, numFeaturesWithMs2 / (float)featureParser.NumData); Console.WriteLine("NumFeaturesWithTag: {0} ({1})", numFeaturesWithTags, numFeaturesWithTags / (float)featureParser.NumData); Console.WriteLine("NumFeaturesWithMatchedTag: {0} ({1})", numFeaturesWithMatchingTags, numFeaturesWithMatchingTags / (float)featureParser.NumData); Console.WriteLine("NumFeaturesWithMoreThanOneMatchedTag: {0} ({1})", numFeaturesWithTwoOrMoreMatchingTags, numFeaturesWithTwoOrMoreMatchingTags / (float)featureParser.NumData); Console.WriteLine("NumFeaturesWithNoIdAndMatchedTag: {0} ({1})", numFeaturesWithNoIdAndMatchingTags, numFeaturesWithNoIdAndMatchingTags / (float)featureParser.NumData); for (var i = 0; i < idFlag.Length; i++) { if (!idFlag[i]) { Console.WriteLine(idList[i].Scan); } } // Console.WriteLine(string.Join(",", filter.GetMatchingMs2ScanNums(8115.973001))); // // Console.WriteLine(featureFileName); }
public void TestFeatureId() { var methodName = MethodBase.GetCurrentMethod().Name; TestUtils.ShowStarting(methodName); const string dataSet = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3"; if (!File.Exists(dataSet)) { Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dataSet); } // Feature: 5236-5286 6-12 8480.3681 5 const int minScanNum = 5236; const int maxScanNum = 5286; const double featureMass = 8480.3681; //const int minScanNum = 7251; //const int maxScanNum = 7326; //const double featureMass = 32347.18; // const int minScanNum = 4451; // const int maxScanNum = 4541; // const double featureMass = 31267.95; var tolerance = new Tolerance(10); var relaxedTolerance = new Tolerance(20); const int minTagLength = 5; const int minMergedTagLength = 7; const int minNumTagMatches = 1; var rawFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".raw"); var run = PbfLcMsRun.GetLcMsRun(rawFileName); var aminoAcidSet = AminoAcidSet.GetStandardAminoAcidSet(); var featureFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".ms1ft"); var filter = new Ms1FtFilter(run, tolerance, featureFileName); var ms2ScanNums = filter.GetMatchingMs2ScanNums(featureMass) .Where(scanNum => scanNum > minScanNum && scanNum < maxScanNum) .ToArray(); const string tagFileName = dataSet + ".seqtag"; //"_MinLength3.seqtag"; //Path.ChangeExtension(dataSet, ".seqtag"); const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta"; if (!File.Exists(fastaFilePath)) { Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath); } var fastaDb = new FastaDatabase(fastaFilePath); var searchableDb = new SearchableDatabase(fastaDb); var tagParser = new SequenceTagParser(tagFileName, minTagLength); var proteinsToTags = new Dictionary<string, IList<MatchedTag>>(); foreach (var ms2ScanNum in ms2ScanNums) { var tags = tagParser.GetSequenceTags(ms2ScanNum); foreach (var tag in tags) { var matchedIndices = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence).ToArray(); foreach (var index in matchedIndices) { var protein = fastaDb.GetProteinName(index); var startIndex = fastaDb.GetZeroBasedPositionInProtein(index); var matchedTag = new MatchedTag(tag, startIndex, featureMass); IList<MatchedTag> existingTags; if (proteinsToTags.TryGetValue(protein, out existingTags)) { existingTags.Add(matchedTag); } else { proteinsToTags.Add(protein, new List<MatchedTag> { matchedTag }); } } } } foreach (var entry in proteinsToTags.OrderByDescending(e => e.Value.Count)) { if (entry.Value.Count < minNumTagMatches) break; var proteinName = entry.Key; var proteinSequence = fastaDb.GetProteinSequence(proteinName); var protein = new Sequence(proteinSequence, aminoAcidSet); Console.WriteLine(proteinName + "\t" + entry.Value.Count); var matchedTagSet = new MatchedTagSet(proteinSequence, aminoAcidSet, tolerance, relaxedTolerance); Console.WriteLine("********** Before merging"); foreach (var matchedTag in entry.Value) { var seq = proteinSequence.Substring(matchedTag.StartIndex, matchedTag.EndIndex - matchedTag.StartIndex); var nTermMass = protein.GetMass(0, matchedTag.StartIndex); var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count); Console.WriteLine("\t{0}\t{1}\t{2}\t{3}\t{4}\t{5}", (matchedTag.NTermFlankingMass - nTermMass), seq, (matchedTag.CTermFlankingMass - cTermMass), matchedTag.StartIndex, matchedTag.IsNTermFlankingMassReliable, matchedTag.IsCTermFlankingMassReliable); matchedTagSet.Add(matchedTag); } Console.WriteLine("********** After merging"); foreach (var matchedTag in matchedTagSet.Tags) { if (matchedTag.Length < minMergedTagLength) continue; var seq = proteinSequence.Substring(matchedTag.StartIndex, matchedTag.EndIndex - matchedTag.StartIndex); var nTermMass = protein.GetMass(0, matchedTag.StartIndex); var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count); Console.WriteLine("\t{0}\t{1}\t{2}\t{3}\t{4}\t{5}", (matchedTag.NTermFlankingMass-nTermMass), seq, (matchedTag.CTermFlankingMass - cTermMass), matchedTag.StartIndex, matchedTag.IsNTermFlankingMassReliable, matchedTag.IsCTermFlankingMassReliable); } break; } }
public void TestTagMatchingSingleSpec() { var methodName = MethodBase.GetCurrentMethod().Name; TestUtils.ShowStarting(methodName); const string dataSet = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3"; const int scanNum = 4533; // Parse sequence tags var tagFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".seqtag"); const int minTagLength = 8; const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta"; if (!File.Exists(tagFileName)) { Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFileName); } if (!File.Exists(fastaFilePath)) { Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath); } var fastaDb = new FastaDatabase(fastaFilePath); var searchableDb = new SearchableDatabase(fastaDb); var tagParser = new SequenceTagParser(tagFileName, minTagLength); var tags = tagParser.GetSequenceTags(scanNum); foreach (var tag in tags) { var matchedProteins = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence) .Select(index => fastaDb.GetProteinName(index)).ToArray(); if (matchedProteins.Any()) { Console.WriteLine("{0}\t{1}\t{2}\t{3}", tag.Sequence, tag.IsPrefix, tag.FlankingMass, string.Join("\t", matchedProteins)); } } }
public void TestGetProteinsWithTagMatchingSingleSpec() { var methodName = MethodBase.GetCurrentMethod().Name; TestUtils.ShowStarting(methodName); const string dataSet = @"H:\Research\Lewy\raw\Lewy_intact_07"; // const int scanNum = 5158; const int minTagLength = 7; const int minNumTagMatches = 1; var aminoAcidSet = AminoAcidSet.GetStandardAminoAcidSet(); const int scanNum = 2; // Parse sequence tags //const string tagFileName = dataSet + ".seqtag"; //"_MinLength3.seqtag"; //Path.ChangeExtension(dataSet, ".seqtag"); const string rawFilePath = ""; const string fastaFilePath = @"H:\Research\Lewy\ID_004858_0EE8CF61.fasta"; if (!File.Exists(rawFilePath)) { Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath); } if (!File.Exists(fastaFilePath)) { Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath); } var fastaDb = new FastaDatabase(fastaFilePath); var searchableDb = new SearchableDatabase(fastaDb); //var tagParser = new SequenceTagParser(tagFileName, minTagLength); //var tags = tagParser.GetSequenceTags(scanNum); var run = PbfLcMsRun.GetLcMsRun(rawFilePath); var spec = run.GetSpectrum(scanNum) as ProductSpectrum; var tagFinder = new SequenceTagFinder(spec, new Tolerance(5)); var tags = tagFinder.GetAllSequenceTagString(); var proteinsToTags = new Dictionary<string, IList<MatchedTag>>(); foreach (var tag in tags) { var matchedIndices = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence).ToArray(); foreach (var index in matchedIndices) { var protein = fastaDb.GetProteinName(index); var startIndex = fastaDb.GetOneBasedPositionInProtein(index); var matchedTag = new MatchedTag(tag, startIndex, 0.0); IList<MatchedTag> existingTags; if (proteinsToTags.TryGetValue(protein, out existingTags)) { existingTags.Add(matchedTag); } else { proteinsToTags.Add(protein, new List<MatchedTag> { matchedTag }); } } } foreach (var entry in proteinsToTags.OrderByDescending(e => e.Value.Count)) { if (entry.Value.Count < minNumTagMatches) break; var proteinName = entry.Key; var proteinSequence = fastaDb.GetProteinSequence(proteinName); var protein = new Sequence(proteinSequence, aminoAcidSet); Console.WriteLine(proteinName + "\t" + entry.Value.Count); foreach (var matchedTag in entry.Value) { var seq = proteinSequence.Substring(matchedTag.StartIndex, matchedTag.EndIndex - matchedTag.StartIndex); var nTermMass = protein.GetMass(0, matchedTag.StartIndex); var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count); Console.WriteLine("\t{0} ({1})\t{2}\t{3} ({4})\t{5}\t{6}\t{7}", matchedTag.NTermFlankingMass, (matchedTag.NTermFlankingMass - nTermMass), seq, matchedTag.CTermFlankingMass, (matchedTag.CTermFlankingMass - cTermMass), matchedTag.StartIndex, matchedTag.IsNTermFlankingMassReliable, matchedTag.IsCTermFlankingMassReliable); } } }
public static Dictionary<string, MatchedTagSet> GetProteinToMatchedTagsMap( IEnumerable<SequenceTag> tags, SearchableDatabase searchableDb, AminoAcidSet aaSet, Tolerance tolerance, Tolerance relaxedTolerance) { var fastaDb = searchableDb.FastaDatabase; var proteinsToTags = new Dictionary<string, MatchedTagSet>(); foreach (var tag in tags) { var matchedIndices = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence).ToArray(); if (matchedIndices.Length > MaxNumProteinMatchesPerTag) continue; foreach (var index in matchedIndices) { var proteinName = fastaDb.GetProteinName(index); var startIndex = fastaDb.GetZeroBasedPositionInProtein(index); var mass = aaSet.GetComposition(tag.Sequence).Mass; var matchedTag = new MatchedTag(tag, startIndex) { Mass = mass }; MatchedTagSet existingMatchedTagSet; if (proteinsToTags.TryGetValue(proteinName, out existingMatchedTagSet)) { existingMatchedTagSet.Add(matchedTag); } else { var proteinSequence = fastaDb.GetProteinSequence(proteinName); var matchedTagSet = new MatchedTagSet(proteinSequence, aaSet, tolerance, relaxedTolerance); matchedTagSet.Add(matchedTag); proteinsToTags.Add(proteinName, matchedTagSet); } } } return proteinsToTags; }