/// <summary>
/// For each dataset in a fixed list, reads its ".seqtag" file, looks every sufficiently long tag up
/// in the FASTA database, and writes a ".dmass" file that repeats the tag line followed by the matched
/// protein, the detected flanking mass, the flanking mass computed from the protein sequence, and
/// the difference between the two.
/// The test is ignored when the data folder, the FASTA file, or a tag file is missing.
/// </summary>
public void FindProteinDeltaMass()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string folderPath = @"D:\MassSpecFiles\Glyco\";
    if (!Directory.Exists(folderPath))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, folderPath);
    }

    var fileSet = new string[]
    {
        "User_sample_test_02252015",
        "User_sample_test_MWCO_02262016",
        "User_sample_test_SEC_F3_03022105",
        "User_sample_test_SEC_F1_02272015",
        "User_sample_test_SEC_F2_02282015"
    };

    const string fastaFilePath = folderPath + "ID_003836_DA9CC1E4.fasta";
    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    // Tags shorter than this are skipped.
    const int minTagLength = 6;

    // Only mass differences strictly inside (minMassDiff, maxMassDiff) are reported.
    const double minMassDiff = -500;
    const double maxMassDiff = 2000;

    // The database, its search index and the amino acid set do not depend on the dataset,
    // so they are built once instead of once per dataset / per protein.
    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);
    var aminoAcidSet = AminoAcidSet.GetStandardAminoAcidSet();

    for (var i = 0; i < fileSet.Length; i++)
    {
        var datasetName = fileSet[i];
        var tagFilePath = folderPath + datasetName + ".seqtag";
        var outputFilePath = folderPath + datasetName + ".dmass";

        // Check before opening the writer so a missing input does not leave an empty output file behind.
        if (!File.Exists(tagFilePath))
        {
            Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
        }

        using (var writer = new StreamWriter(outputFilePath))
        {
            var isHeader = true;
            var nReadSeqTag = 0;
            var nColumn = 0;

            Console.WriteLine(@"Reading {0} file", tagFilePath);

            foreach (var line in File.ReadAllLines(tagFilePath))
            {
                if (isHeader)
                {
                    // The first line fixes the expected column count and is echoed with the four added columns.
                    isHeader = false;
                    nColumn = line.Split('\t').Length;
                    writer.WriteLine(line + "\t" + "Protein" + "\t" + "DetectedFlankingMass" + "\t" + "ExpectedFlankingMass" + "\t" + "DeltaMass");
                    continue;
                }

                var token = line.Split('\t');
                if (token.Length != nColumn) continue;

                var tag = token[1];
                if (tag.Length < minTagLength) continue;

                // Third column equal to "1" selects the N-terminal handling below.
                var nTerminal = token[2].Equals("1");
                var detectedFlankingMass = Double.Parse(token[3]);

                // For the non-N-terminal case one water mass is removed from the detected value.
                if (!nTerminal) detectedFlankingMass -= Composition.H2O.Mass;

                nReadSeqTag++;

                var matchedProteins = searchableDb.FindAllMatchedSequenceIndices(tag)
                    .Select(index => fastaDb.GetProteinName(index))
                    .Distinct()
                    .ToArray();
                if (matchedProteins.Length < 1) continue;

                foreach (var protName in matchedProteins)
                {
                    var seqStr = fastaDb.GetProteinSequence(protName);
                    var oriSeq = new Sequence(seqStr, aminoAcidSet);

                    // Visit every non-overlapping occurrence of the tag in the protein sequence.
                    var startIdx = 0;
                    while (true)
                    {
                        // Ordinal search: sequences are plain residue letters, no linguistic comparison is wanted.
                        var idx = seqStr.IndexOf(tag, startIdx, StringComparison.Ordinal);
                        if (idx < 0) break; // no further match

                        // Number of terminal truncations tried: 0 and 1 residues for N-terminal tags, 0 otherwise.
                        var nClv = nTerminal ? 2 : 1;
                        for (var j = 0; j < nClv; j++)
                        {
                            var flankComposition = nTerminal
                                ? oriSeq.GetComposition(j, idx)
                                : oriSeq.GetComposition(idx + tag.Length, oriSeq.Count - j);

                            var massDiff = detectedFlankingMass - flankComposition.Mass;
                            if (massDiff > minMassDiff && massDiff < maxMassDiff)
                            {
                                writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", line, protName, detectedFlankingMass, flankComposition.Mass, massDiff);
                            }

                            if (massDiff > maxMassDiff) break;
                        }

                        startIdx = idx + tag.Length;
                    }
                }
            }

            Console.WriteLine(@"{0} seq tags are processed", nReadSeqTag);
        }

        Console.WriteLine(@"Done");
    }
}
/// <summary>
/// Writes one tab-separated row per match to <paramref name="outputFilePath"/>, preceded by a header row.
/// Scans are visited from <c>_run.MinLcScan</c> to <c>_run.MaxLcScan</c>; within a scan the matches are
/// written in the reverse of the set's order.
/// </summary>
/// <param name="matches">Matches indexed by scan number; a null entry means the scan has no matches.</param>
/// <param name="outputFilePath">Path of the result file to create (overwritten if it exists).</param>
/// <param name="database">Database used to resolve protein name, description, length and position.</param>
private void WriteResultsToFile(SortedSet<DatabaseSequenceSpectrumMatch>[] matches, string outputFilePath, FastaDatabase database)
{
    using (var writer = new StreamWriter(outputFilePath))
    {
        writer.WriteLine("Scan\tPre\tSequence\tPost\tModifications\tComposition\tProteinName\tProteinDesc" +
                         "\tProteinLength\tStart\tEnd\tCharge\tMostAbundantIsotopeMz\tMass\t#MatchedFragments\tIcScore");

        for (var scanNum = _run.MinLcScan; scanNum <= _run.MaxLcScan; scanNum++)
        {
            if (matches[scanNum] == null) continue;

            foreach (var match in matches[scanNum].Reverse())
            {
                // The ion must be validated BEFORE it is used for re-scoring; previously it was
                // dereferenced first, so the null check could never be reached with a null ion.
                var ion = match.Ion;
                if (ion == null)
                {
                    Console.WriteLine(@"Null ion!");
                    continue;
                }

                var scores = _bottomUpScorer.GetScores(match, ion.Composition, ion.Charge, scanNum);
                if (scores == null)
                {
                    // Without scores neither the Modifications nor the IcScore column can be written.
                    Console.WriteLine(@"Null scores");
                    continue;
                }

                var sequence = match.Sequence;
                var offset = match.Offset;
                var start = database.GetOneBasedPositionInProtein(offset) + 1 + match.NumNTermCleavages;
                var end = start + sequence.Length - 1;
                var proteinName = database.GetProteinName(offset);
                var protLength = database.GetProteinLength(proteinName);

                // Note for DblToString(value, 9, true), by having "9" and "true",
                // values between 100 and 999 Da will have 7 digits after the decimal place, and
                // values between 1000 and 9999 will have 6 digits after the decimal place
                writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\t{15}",
                    scanNum,
                    match.Pre,                                                              // Pre
                    sequence,                                                               // Sequence
                    match.Post,                                                             // Post
                    scores.Modifications,                                                   // Modifications
                    ion.Composition,                                                        // Composition
                    proteinName,                                                            // ProteinName
                    database.GetProteinDescription(offset),                                 // ProteinDesc
                    protLength,                                                             // ProteinLength
                    start,                                                                  // Start
                    end,                                                                    // End
                    ion.Charge,                                                             // Charge
                    StringUtilities.DblToString(ion.GetMostAbundantIsotopeMz(), 9, true),   // MostAbundantIsotopeMz
                    StringUtilities.DblToString(ion.Composition.Mass, 9, true),             // Mass
                    match.Score,                                                            // #MatchedFragments column
                    scores.Score                                                            // IcScore (re-scored)
                );
            }
        }
    }
}
/// <summary>
/// Reads identifications (rows up to the first q-value above 0.01) and a sequence tag file, then reports:
/// the average number of tag-to-protein matches per scan, a histogram of the number of distinct proteins
/// matched per scan, the number of identified scans, and how many of those had at least one tag that
/// matched the identified protein.
/// The test is ignored when any of the required input files is missing.
/// </summary>
public void CountMatchedProteins()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const int minTagLength = 3;

    // scan -> identified protein, and scan -> "some tag hit the identified protein" flag
    var proteinByScan = new Dictionary<int, string>();
    var confirmedByTag = new Dictionary<int, bool>();

    const string resultFilePath = @"H:\Research\ProMex\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv";
    if (!File.Exists(resultFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath);
    }

    var parser = new TsvFileParser(resultFilePath);
    var scans = parser.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray();
    var proteinNames = parser.GetData("ProteinName").ToArray();
    var qValues = parser.GetData("QValue").Select(s => Convert.ToDouble(s)).ToArray();

    // Stop at the first row whose q-value exceeds 0.01.
    for (var row = 0; row < qValues.Length && !(qValues[row] > 0.01); row++)
    {
        proteinByScan.Add(scans[row], proteinNames[row]);
        confirmedByTag.Add(scans[row], false);
    }

    const string rawFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.raw";
    if (!File.Exists(rawFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
    }

    // The returned run is not used below; the call is retained exactly as in the original
    // NOTE(review): presumably kept for its side effects or as a leftover - confirm before removing.
    var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

    const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";
    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);
    Console.WriteLine("Sequence length: {0}", fastaDb.GetSequence().Length);

    const string tagFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.seqtag";
    if (!File.Exists(tagFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
    }

    // number of distinct matched proteins -> number of scans with that count
    var hist = new Dictionary<int, int>();
    Action<int> countScanWithMatches = proteinCount =>
    {
        int occurrences;
        hist.TryGetValue(proteinCount, out occurrences); // leaves 0 when the key is absent
        hist[proteinCount] = occurrences + 1;
    };

    var scanSet = new HashSet<int>();
    HashSet<string> proteinsOfCurrentScan = null;
    var prevScan = -1;
    var totalNumMatches = 0L;

    // The first line of the tag file is a header and is skipped.
    foreach (var line in File.ReadAllLines(tagFilePath).Skip(1))
    {
        var token = line.Split('\t');
        if (token.Length < 3) continue;

        var scan = Convert.ToInt32(token[0]);

        string identifiedProtein;
        proteinByScan.TryGetValue(scan, out identifiedProtein); // null when the scan has no identification

        if (scan != prevScan)
        {
            // A new scan starts: close the histogram entry of the previous one.
            if (proteinsOfCurrentScan != null)
            {
                countScanWithMatches(proteinsOfCurrentScan.Count);
            }

            prevScan = scan;
            proteinsOfCurrentScan = new HashSet<string>();
        }

        scanSet.Add(scan);

        var tag = token[1];
        if (tag.Length < minTagLength) continue;
        if (proteinsOfCurrentScan == null) continue;

        foreach (var index in searchableDb.FindAllMatchedSequenceIndices(tag))
        {
            var matchedProtein = fastaDb.GetProteinName(index);
            proteinsOfCurrentScan.Add(matchedProtein);
            ++totalNumMatches;

            if (identifiedProtein != null && matchedProtein.Equals(identifiedProtein))
            {
                confirmedByTag[scan] = true;
            }
        }
    }

    // Close the histogram entry of the last scan.
    if (proteinsOfCurrentScan != null)
    {
        countScanWithMatches(proteinsOfCurrentScan.Count);
    }

    Console.WriteLine("AvgNumMatches: {0}", totalNumMatches / (float)scanSet.Count);

    Console.WriteLine("Histogram:");
    foreach (var bin in hist.OrderBy(e => e.Key))
    {
        Console.WriteLine("{0}\t{1}", bin.Key, bin.Value);
    }

    Console.WriteLine("NumId: {0}", confirmedByTag.Count);
    Console.WriteLine("NumIdByTag: {0}", confirmedByTag.Values.Count(confirmed => confirmed));
}
/// <summary>
/// Looks every sequence tag (length >= 6) of a tag file up in a FASTA database, collects for each matched
/// protein the set of scans whose tags hit it, and prints the number of database entries, the number
/// of proteins with at least one hit, and the average number of matched scans per database entry.
/// The test is ignored when the FASTA file or the tag file is missing.
/// </summary>
public void CountMatchedScansPerProtein()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const int minTagLength = 6;

    // protein name -> scans with at least one tag matching that protein
    var proteinToScan = new Dictionary<string, HashSet<int>>();

    const string fastaFilePath = @"D:\Research\Data\CommonContaminants\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta";
    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);
    Console.WriteLine(@"Sequence length: {0}", fastaDb.GetSequence().Length);

    const string tagFilePath = @"D:\MassSpecFiles\co_culture\23A_pellet_TD_3Feb14_Bane_PL011402.seqtag";
    if (!File.Exists(tagFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
    }

    var isHeader = true;

    // Number of DISTINCT (protein, scan) pairs. Previously every tag hit was counted, so several
    // tags of the same scan (or several positions in the same protein) inflated the reported average.
    var numMatchedPairs = 0;

    foreach (var line in File.ReadAllLines(tagFilePath))
    {
        if (isHeader)
        {
            isHeader = false;
            continue;
        }

        var token = line.Split('\t');
        if (token.Length != 3) continue;

        var scan = Convert.ToInt32(token[0]);
        var tag = token[1];
        if (tag.Length < minTagLength) continue;

        foreach (var matchedProtein in searchableDb.FindAllMatchedSequenceIndices(tag)
            .Select(index => fastaDb.GetProteinName(index)))
        {
            HashSet<int> matchedScans;
            if (!proteinToScan.TryGetValue(matchedProtein, out matchedScans))
            {
                matchedScans = new HashSet<int>();
                proteinToScan.Add(matchedProtein, matchedScans);
            }

            // HashSet.Add returns true only when the scan was not yet recorded for this protein.
            if (matchedScans.Add(scan))
            {
                ++numMatchedPairs;
            }
        }
    }

    var numMatchedProteins = proteinToScan.Keys.Count;
    var numAllProteins = fastaDb.GetNumEntries();

    Console.WriteLine("NumAllProteins: {0}", numAllProteins);
    Console.WriteLine("NumMatchedProteins: {0}", numMatchedProteins);
    Console.WriteLine("AvgMatchedScansPerProtein: {0}", numMatchedPairs / (float)numAllProteins);
}
/// <summary>
/// Writes one tab-separated row for every scan in <c>_ms2ScanNums</c> that has a match, preceded by a
/// header row, to <paramref name="outputFilePath"/>.
/// </summary>
/// <param name="matches">Best match per scan, indexed by scan number; null entries are skipped.</param>
/// <param name="outputFilePath">Path of the result file to create (overwritten if it exists).</param>
/// <param name="database">Database used to resolve protein name, description, length, position and entry count.</param>
private void WriteResultsToFile(DatabaseSequenceSpectrumMatch[] matches, string outputFilePath, FastaDatabase database)
{
    const string header =
        "Scan\tPre\tSequence\tPost\tModifications\tComposition\tProteinName\tProteinDesc" +
        "\tProteinLength\tStart\tEnd\tCharge\tMostAbundantIsotopeMz\tMass\t#MatchedFragments\tProbability\tSpecEValue\tEValue";

    const string rowFormat =
        "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\t{15}\t{16}\t{17}";

    using (var output = new StreamWriter(outputFilePath))
    {
        output.WriteLine(header);

        foreach (var ms2Scan in _ms2ScanNums)
        {
            var hit = matches[ms2Scan];
            if (hit == null)
            {
                continue;
            }

            var matchedSequence = hit.Sequence;
            var dbOffset = hit.Offset;

            var startInProtein = database.GetOneBasedPositionInProtein(dbOffset) + 1 + hit.NumNTermCleavages;
            var endInProtein = startInProtein + matchedSequence.Length - 1;

            var proteinName = database.GetProteinName(dbOffset);
            var proteinLength = database.GetProteinLength(proteinName);
            var proteinDescription = database.GetProteinDescription(dbOffset);

            var precursorIon = hit.Ion;
            var probability = CompositeScorer.GetProbability(hit.Score);

            // DblToString(value, 9, true): with "9" and "true", values between 100 and 999 Da get
            // 7 digits after the decimal place and values between 1000 and 9999 get 6.
            var mostAbundantMzText = StringUtilities.DblToString(precursorIon.GetMostAbundantIsotopeMz(), 9, true);
            var massText = StringUtilities.DblToString(precursorIon.Composition.Mass, 9, true);
            var probabilityText = StringUtilities.DblToString(probability, 4);

            // Both E-value columns switch to scientific notation below 0.001.
            // SpecEValue column: the match's spectral E-value.
            var specEValueText = StringUtilities.DblToString(ExcelMinValue(hit.SpecEvalue), 6, true, 0.001);
            // EValue column: spectral E-value multiplied by the number of database entries.
            var eValueText = StringUtilities.DblToString(ExcelMinValue(hit.SpecEvalue * database.GetNumEntries()), 6, true, 0.001);

            output.WriteLine(rowFormat,
                ms2Scan,                    // Scan
                hit.Pre,                    // Pre
                matchedSequence,            // Sequence
                hit.Post,                   // Post
                hit.ModificationText,       // Modifications
                precursorIon.Composition,   // Composition
                proteinName,                // ProteinName
                proteinDescription,         // ProteinDesc
                proteinLength,              // ProteinLength
                startInProtein,             // Start
                endInProtein,               // End
                precursorIon.Charge,        // Charge
                mostAbundantMzText,         // MostAbundantIsotopeMz
                massText,                   // Mass
                hit.NumMatchedFragments,    // #MatchedFragments
                probabilityText,            // Probability
                specEValueText,             // SpecEValue
                eValueText);                // EValue
        }
    }
}
/// <summary>
/// Copies a 3-column sequence tag file to a "matchedtag" file, appending to every row the number of
/// distinct proteins whose sequence contains the tag and a comma-separated list of their names.
/// The test is ignored when the FASTA file or the tag file is missing.
/// </summary>
public void FindProteins()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";
    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);

    const string tagFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3_seqtag.tsv";
    if (!File.Exists(tagFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
    }

    const string outputFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3_matchedtag.tsv";

    using (var writer = new StreamWriter(outputFilePath))
    {
        var isHeader = true;
        foreach (var line in File.ReadAllLines(tagFilePath))
        {
            if (isHeader)
            {
                isHeader = false;

                // Every data row appends TWO columns (count, names), so the header must name both;
                // with only "Proteins" the header was one column short and mislabelled the count.
                writer.WriteLine(line + "\t" + "NumProteins" + "\t" + "Proteins");
                continue;
            }

            var token = line.Split('\t');
            if (token.Length != 3) continue;

            var tag = token[1];
            var matchedProteins = searchableDb.FindAllMatchedSequenceIndices(tag)
                .Select(index => fastaDb.GetProteinName(index))
                .Distinct()
                .ToArray();

            var matchedProteinStr = string.Join(",", matchedProteins);
            writer.WriteLine("{0}\t{1}\t{2}", line, matchedProteins.Length, matchedProteinStr);
        }
    }

    Console.WriteLine(@"Done");
}
/// <summary>
/// Enumerates the annotations and offsets produced by the indexed database for a small FASTA file
/// (arguments 6, 20, 2, 0 and trypsin) and prints, for each one, the annotation, its offset, the name and
/// length of the containing protein, and the one-based position plus one.
/// The test is ignored when the FASTA file is missing.
/// </summary>
public void TestGettingProteinLengthAndPosition()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\Short.fasta";
    if (!File.Exists(dbFile))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dbFile);
    }

    var db = new FastaDatabase(dbFile);
    db.Read();
    var indexedDb = new IndexedDatabase(db);

    foreach (var entry in indexedDb.AnnotationsAndOffsets(6, 20, 2, 0, Enzyme.Trypsin))
    {
        var peptideOffset = entry.Offset;
        var peptideAnnotation = entry.Annotation;

        Console.WriteLine(
            "{0}\t{1}\t{2}\t{3}\t{4}",
            peptideAnnotation,
            peptideOffset,
            db.GetProteinName(peptideOffset),
            db.GetProteinLength(db.GetProteinName(peptideOffset)),
            db.GetOneBasedPositionInProtein(peptideOffset) + 1);
    }
}
/// <summary>
/// Prototype of a bit-vector based identification: deconvoluted fragment masses of the selected MS/MS
/// scans are binned into one bit array per mass bin (one bit per scan); every intact protein of the FASTA
/// file is then scored per scan by counting how many of its suffix fragment masses (fragment composition
/// plus the Y ion offset) fall into a bin that is set for that scan. The best protein and score per scan
/// are printed.
/// The test is ignored when the raw file, FASTA file or modification file is missing.
/// </summary>
/// <remarks>
/// A prefix-ion counterpart of the scoring pass existed only as commented-out code in the original and is
/// not reproduced here.
/// </remarks>
public void TestId()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string rawFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.raw";
    const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\SO2312.fasta";
    const string modFilePath = @"H:\Research\QCShew_TopDown\Production\Mods.txt";

    // Same convention as the other tests in this file: skip instead of failing when inputs are absent.
    foreach (var requiredFile in new[] { rawFilePath, fastaFilePath, modFilePath })
    {
        if (!File.Exists(requiredFile))
        {
            Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, requiredFile);
        }
    }

    const int numBits = 29; // max error: 4ppm
    const int minCharge = 2;
    const int maxCharge = 20;
    var tolerance = new Tolerance(10);
    const double corrThreshold = 0.7;

    var comparer = new MzComparerWithBinning(numBits);

    const double minFragmentMass = 200.0;
    const double maxFragmentMass = 50000.0;
    var minFragMassBin = comparer.GetBinNumber(minFragmentMass);
    var maxFragMassBin = comparer.GetBinNumber(maxFragmentMass);

    var aminoAcidSet = new AminoAcidSet(modFilePath);

    var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

    // Only this scan is examined.
    var ms2ScanNumArr = new[] { 5189 };

    var sw = new Stopwatch();

    sw.Start();
    Console.Write("Building Spectrum Arrays...");

    // massVectors[bin - minFragMassBin][scan] is true when the scan has a deconvoluted peak in that bin.
    var massVectors = new BitArray[maxFragMassBin - minFragMassBin + 1];
    for (var i = minFragMassBin; i <= maxFragMassBin; i++)
    {
        massVectors[i - minFragMassBin] = new BitArray(run.MaxLcScan + 1);
    }

    foreach (var ms2ScanNum in ms2ScanNumArr)
    {
        var productSpec = run.GetSpectrum(ms2ScanNum) as ProductSpectrum;
        if (productSpec == null) continue;

        productSpec.FilterNoise();
        var deconvolutedPeaks = Deconvoluter.GetDeconvolutedPeaks(productSpec, minCharge, maxCharge, 2, 1.1, tolerance, corrThreshold);
        if (deconvolutedPeaks == null) continue;

        foreach (var p in deconvolutedPeaks)
        {
            // Set every bin touched by the tolerance window around the peak mass.
            var mass = p.Mass;
            var deltaMass = tolerance.GetToleranceAsDa(mass, 1);
            var minBinNum = comparer.GetBinNumber(mass - deltaMass);
            var maxBinNum = comparer.GetBinNumber(mass + deltaMass);

            for (var binNum = minBinNum; binNum <= maxBinNum; binNum++)
            {
                if (binNum >= minFragMassBin && binNum <= maxFragMassBin)
                {
                    massVectors[binNum - minFragMassBin][ms2ScanNum] = true;
                }
            }
        }
    }

    sw.Stop();
    Console.WriteLine(@"{0:f4} sec.", sw.Elapsed.TotalSeconds);

    sw.Reset();
    sw.Start();

    var fastaDb = new FastaDatabase(fastaFilePath);
    fastaDb.Read();
    var indexedDb = new IndexedDatabase(fastaDb);
    var numProteins = 0;
    var intactProteinAnnotationAndOffsets = indexedDb.IntactSequenceAnnotationsAndOffsets(0, int.MaxValue);

    var bestProtein = new string[run.MaxLcScan + 1];
    var bestScore = new int[run.MaxLcScan + 1];

    foreach (var annotationAndOffset in intactProteinAnnotationAndOffsets)
    {
        if (++numProteins % 10 == 0)
        {
            // The count is a positive multiple of 10 here, so the ordinal suffix is always "th"
            // (the original st/nd/rd branches and the "!= 0" test could never be taken).
            Console.WriteLine("Processing {0}th proteins...", numProteins);

            sw.Stop();
            Console.WriteLine("Elapsed Time: {0:f4} sec", sw.Elapsed.TotalSeconds);
            sw.Reset();
            sw.Start();
        }

        var annotation = annotationAndOffset.Annotation;
        var offset = annotationAndOffset.Offset;

        // Drop the first two and last two characters of the annotation to get the bare sequence.
        var protSequence = annotation.Substring(2, annotation.Length - 4);

        // suffix
        var seqGraph = SequenceGraph.CreateGraph(aminoAcidSet, AminoAcid.ProteinNTerm, protSequence,
            AminoAcid.ProteinCTerm);
        if (seqGraph == null) continue;

        // Currently limited to zero N-terminal cleavages (single iteration).
        for (var numNTermCleavage = 0; numNTermCleavage <= 0; numNTermCleavage++)
        {
            if (numNTermCleavage > 0) seqGraph.CleaveNTerm();

            var allCompositions = seqGraph.GetAllFragmentNodeCompositions().ToArray();

            var scoreArr = new int[run.MaxLcScan + 1];
            foreach (var fragComp in allCompositions)
            {
                var suffixMass = fragComp.Mass + BaseIonType.Y.OffsetComposition.Mass;
                var binNum = comparer.GetBinNumber(suffixMass);
                if (binNum < minFragMassBin || binNum > maxFragMassBin) continue;

                var vector = massVectors[binNum - minFragMassBin];
                foreach (var ms2ScanNum in ms2ScanNumArr)
                {
                    if (vector[ms2ScanNum])
                    {
                        ++scoreArr[ms2ScanNum];
                        Console.WriteLine(suffixMass);
                    }
                }
            }

            foreach (var ms2ScanNum in ms2ScanNumArr)
            {
                if (scoreArr[ms2ScanNum] > bestScore[ms2ScanNum])
                {
                    bestScore[ms2ScanNum] = scoreArr[ms2ScanNum];
                    var proteinName = fastaDb.GetProteinName(offset);

                    // A trailing apostrophe marks a protein scored after one N-terminal cleavage.
                    bestProtein[ms2ScanNum] = proteinName + (numNTermCleavage == 1 ? "'" : "");
                }
            }
        }
    }

    Console.WriteLine("ScanNum\tBestProtein\tScore");
    foreach (var ms2ScanNum in ms2ScanNumArr)
    {
        Console.WriteLine("{0}\t{1}\t{2}", ms2ScanNum, bestProtein[ms2ScanNum] ?? "", bestScore[ms2ScanNum]);
    }
}
/// <summary>
/// Cross-references MS1 features (".ms1ft") with identifications (q-value up to 0.01) and sequence tags.
/// For every feature it determines whether an identification falls inside the feature's mass tolerance,
/// scan range and charge range, whether an MS/MS scan isolated it, whether those scans produced tags,
/// and whether the tags match database proteins. Summary counts are printed, followed by the scan number
/// of every identification that was not assigned to any feature.
/// The test is ignored when any required input file is missing.
/// </summary>
public void TestFeatureIdMatching()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string resultFilePath = @"H:\Research\QCShew_TopDown\Production\M1_V092\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv";
    if (!File.Exists(resultFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath);
    }

    var resultParser = new MsPathFinderParser(resultFilePath);

    const double qValueThreshold = 0.01;
    const double tolerancePpm = 13;

    const string dataSet = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3";
    var rawFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".raw");
    if (!File.Exists(rawFileName))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFileName);
    }

    var run = PbfLcMsRun.GetLcMsRun(rawFileName);

    // Identifications up to the first one above the q-value threshold, sorted by mass for binary search.
    var idList = resultParser.GetIdList().TakeWhile(id => id.QValue <= qValueThreshold).OrderBy(id => id.Mass).ToList();
    var idMassList = idList.Select(id => id.Mass).ToList();
    var idFlag = new bool[idList.Count]; // idFlag[k]: idList[k] was matched to at least one feature

    // Parse sequence tags
    var tagFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".seqtag");

    const int minTagLength = 6;
    const int numProtMatches = 4;

    const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.icsfldecoy.fasta";

    if (!File.Exists(tagFileName))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFileName);
    }

    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);

    var tagParser = new SequenceTagParser(tagFileName, minTagLength);

    var featureFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".ms1ft");
    if (!File.Exists(featureFileName))
    {
        // Guarded like the other inputs; previously a missing feature file failed the test.
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, featureFileName);
    }

    var featureParser = new TsvFileParser(featureFileName);
    var minScan = featureParser.GetData("MinScan").Select(s => Convert.ToInt32(s)).ToArray();
    var maxScan = featureParser.GetData("MaxScan").Select(s => Convert.ToInt32(s)).ToArray();
    var minCharge = featureParser.GetData("MinCharge").Select(s => Convert.ToInt32(s)).ToArray();
    var maxCharge = featureParser.GetData("MaxCharge").Select(s => Convert.ToInt32(s)).ToArray();
    var monoMass = featureParser.GetData("MonoMass").Select(s => Convert.ToDouble(s)).ToArray();

    // Single definition of "identification lies inside feature f" (scan and charge ranges, both
    // inclusive). The downward search used exclusive scan bounds while the upward search used
    // inclusive ones; sharing the predicate removes that inconsistency.
    Func<MsPathFinderId, int, bool> isInsideFeature = (id, f) =>
        id.Scan >= minScan[f] && id.Scan <= maxScan[f] &&
        id.Charge >= minCharge[f] && id.Charge <= maxCharge[f];

    // The tolerance does not depend on the feature, so it is created once.
    var massTolerance = new Tolerance(tolerancePpm);

    var numFeaturesWithId = 0;
    var numFeaturesWithMs2 = 0;
    var numFeaturesWithTags = 0;
    var numFeaturesWithMatchingTags = 0;
    var numFeaturesWithTwoOrMoreMatchingTags = 0;
    var numFeaturesWithNoIdAndMatchingTags = 0;

    for (var i = 0; i < featureParser.NumData; i++)
    {
        var mass = monoMass[i];

        // Find Id
        var tolDa = massTolerance.GetToleranceAsDa(mass, 1);
        var minMass = mass - tolDa;
        var maxMass = mass + tolDa;

        // A negative result is the bitwise complement of the insertion point.
        var index = idMassList.BinarySearch(mass);
        if (index < 0) index = ~index;

        var hasId = false;

        // go down
        for (var curIndex = index - 1; curIndex >= 0; curIndex--)
        {
            var curId = idList[curIndex];
            if (curId.Mass < minMass) break;

            if (isInsideFeature(curId, i))
            {
                hasId = true;
                idFlag[curIndex] = true;
            }
        }

        // go up
        for (var curIndex = index; curIndex < idList.Count; curIndex++)
        {
            var curId = idList[curIndex];
            if (curId.Mass > maxMass) break;

            if (isInsideFeature(curId, i))
            {
                hasId = true;
                idFlag[curIndex] = true;
            }
        }

        if (hasId) ++numFeaturesWithId;

        // Find MS2 scans whose isolation window contains the feature's most abundant isotope peak.
        var tags = new List<SequenceTag>();
        var hasMs2 = false;
        for (var scanNum = minScan[i]; scanNum <= maxScan[i]; scanNum++)
        {
            var isolationWindow = run.GetIsolationWindow(scanNum);
            if (isolationWindow == null) continue;

            // Estimate the charge from the window's target m/z and reject charges outside the feature.
            var isolationWindowTargetMz = isolationWindow.IsolationWindowTargetMz;
            var charge = (int)Math.Round(mass / isolationWindowTargetMz);
            if (charge < minCharge[i] || charge > maxCharge[i]) continue;

            var mz = Ion.GetIsotopeMz(mass, charge, Averagine.GetIsotopomerEnvelope(mass).MostAbundantIsotopeIndex);
            if (isolationWindow.Contains(mz))
            {
                tags.AddRange(tagParser.GetSequenceTags(scanNum));
                hasMs2 = true;
            }
        }

        if (hasMs2) ++numFeaturesWithMs2;
        if (tags.Any()) ++numFeaturesWithTags;

        // protein name -> number of tag hits for this feature
        var protHist = new Dictionary<string, int>();
        var hasMatchedTag = false;
        foreach (var tag in tags)
        {
            var matchedProteins = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence)
                .Select(idx => fastaDb.GetProteinName(idx))
                .ToArray();
            if (matchedProteins.Length == 0) continue;

            hasMatchedTag = true;
            foreach (var protein in matchedProteins)
            {
                int num;
                protHist.TryGetValue(protein, out num); // leaves 0 when the protein is new
                protHist[protein] = num + 1;
            }
        }

        if (hasMatchedTag)
        {
            ++numFeaturesWithMatchingTags;
            if (!hasId) ++numFeaturesWithNoIdAndMatchingTags;
        }

        if (protHist.Any())
        {
            // Counted when the best protein collected at least numProtMatches tag hits.
            var maxOcc = protHist.Values.Max();
            if (maxOcc >= numProtMatches) ++numFeaturesWithTwoOrMoreMatchingTags;
        }
    }

    Console.WriteLine("NumFeatures: {0}", featureParser.NumData);
    Console.WriteLine("NumId: {0}", idList.Count);
    Console.WriteLine("NumFeaturesWithId: {0} ({1})", numFeaturesWithId, numFeaturesWithId / (float)featureParser.NumData);
    Console.WriteLine("NumFeaturesWithMs2: {0} ({1})", numFeaturesWithMs2, numFeaturesWithMs2 / (float)featureParser.NumData);
    Console.WriteLine("NumFeaturesWithTag: {0} ({1})", numFeaturesWithTags, numFeaturesWithTags / (float)featureParser.NumData);
    Console.WriteLine("NumFeaturesWithMatchedTag: {0} ({1})", numFeaturesWithMatchingTags, numFeaturesWithMatchingTags / (float)featureParser.NumData);
    Console.WriteLine("NumFeaturesWithMoreThanOneMatchedTag: {0} ({1})", numFeaturesWithTwoOrMoreMatchingTags, numFeaturesWithTwoOrMoreMatchingTags / (float)featureParser.NumData);
    Console.WriteLine("NumFeaturesWithNoIdAndMatchedTag: {0} ({1})", numFeaturesWithNoIdAndMatchingTags, numFeaturesWithNoIdAndMatchingTags / (float)featureParser.NumData);

    // List the scans of identifications that no feature accounted for.
    for (var i = 0; i < idFlag.Length; i++)
    {
        if (!idFlag[i])
        {
            Console.WriteLine(idList[i].Scan);
        }
    }
}
/// <summary>
/// For one hard-coded feature (mass and scan range), collects the sequence tags of its matching MS/MS
/// scans, groups the tag hits by database protein, and for the protein with the most hits prints the
/// matched tags with their flanking-mass differences before and after merging them into a tag set.
/// The test is ignored when the raw, feature, tag or FASTA file is missing.
/// </summary>
public void TestFeatureId()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dataSet = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3";
    const string tagFileName = dataSet + ".seqtag";
    const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";

    var rawFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".raw");
    var featureFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".ms1ft");

    // "dataSet" is only the extension-less base path, so testing it with File.Exists always failed and
    // the test could never run. Check the files that are actually read instead.
    foreach (var requiredFile in new[] { rawFileName, featureFileName, tagFileName, fastaFilePath })
    {
        if (!File.Exists(requiredFile))
        {
            Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, requiredFile);
        }
    }

    // Feature: 5236-5286 6-12 8480.3681 5
    const int minScanNum = 5236;
    const int maxScanNum = 5286;
    const double featureMass = 8480.3681;

    var tolerance = new Tolerance(10);
    var relaxedTolerance = new Tolerance(20);

    const int minTagLength = 5;
    const int minMergedTagLength = 7;
    const int minNumTagMatches = 1;

    var run = PbfLcMsRun.GetLcMsRun(rawFileName);
    var aminoAcidSet = AminoAcidSet.GetStandardAminoAcidSet();

    var filter = new Ms1FtFilter(run, tolerance, featureFileName);

    // Scans strictly between the feature's first and last scan.
    var ms2ScanNums = filter.GetMatchingMs2ScanNums(featureMass)
        .Where(scanNum => scanNum > minScanNum && scanNum < maxScanNum)
        .ToArray();

    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);

    var tagParser = new SequenceTagParser(tagFileName, minTagLength);

    // protein name -> tags found in that protein's sequence
    var proteinsToTags = new Dictionary<string, IList<MatchedTag>>();
    foreach (var ms2ScanNum in ms2ScanNums)
    {
        var tags = tagParser.GetSequenceTags(ms2ScanNum);
        foreach (var tag in tags)
        {
            var matchedIndices = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence).ToArray();
            foreach (var index in matchedIndices)
            {
                var protein = fastaDb.GetProteinName(index);
                var startIndex = fastaDb.GetZeroBasedPositionInProtein(index);
                var matchedTag = new MatchedTag(tag, startIndex, featureMass);

                IList<MatchedTag> existingTags;
                if (proteinsToTags.TryGetValue(protein, out existingTags))
                {
                    existingTags.Add(matchedTag);
                }
                else
                {
                    proteinsToTags.Add(protein, new List<MatchedTag> { matchedTag });
                }
            }
        }
    }

    foreach (var entry in proteinsToTags.OrderByDescending(e => e.Value.Count))
    {
        if (entry.Value.Count < minNumTagMatches) break;

        var proteinName = entry.Key;
        var proteinSequence = fastaDb.GetProteinSequence(proteinName);
        var protein = new Sequence(proteinSequence, aminoAcidSet);
        Console.WriteLine(proteinName + "\t" + entry.Value.Count);

        var matchedTagSet = new MatchedTagSet(proteinSequence, aminoAcidSet, tolerance, relaxedTolerance);

        Console.WriteLine("********** Before merging");
        foreach (var matchedTag in entry.Value)
        {
            var seq = proteinSequence.Substring(matchedTag.StartIndex, matchedTag.EndIndex - matchedTag.StartIndex);
            var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
            var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);
            Console.WriteLine("\t{0}\t{1}\t{2}\t{3}\t{4}\t{5}",
                (matchedTag.NTermFlankingMass - nTermMass), seq, (matchedTag.CTermFlankingMass - cTermMass), matchedTag.StartIndex,
                matchedTag.IsNTermFlankingMassReliable, matchedTag.IsCTermFlankingMassReliable);

            matchedTagSet.Add(matchedTag);
        }

        Console.WriteLine("********** After merging");
        foreach (var matchedTag in matchedTagSet.Tags)
        {
            if (matchedTag.Length < minMergedTagLength) continue;

            var seq = proteinSequence.Substring(matchedTag.StartIndex, matchedTag.EndIndex - matchedTag.StartIndex);
            var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
            var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);
            Console.WriteLine("\t{0}\t{1}\t{2}\t{3}\t{4}\t{5}",
                (matchedTag.NTermFlankingMass - nTermMass), seq, (matchedTag.CTermFlankingMass - cTermMass), matchedTag.StartIndex,
                matchedTag.IsNTermFlankingMassReliable, matchedTag.IsCTermFlankingMassReliable);
        }

        // Only the protein with the most tag hits is reported.
        break;
    }
}
/// <summary>
/// Prints, for every sequence tag (minimum length 8) of scan 4533 that occurs in the FASTA database, the
/// tag sequence, its prefix flag, its flanking mass and the tab-separated names of the matched proteins.
/// The test is ignored when the tag file or the FASTA file is missing.
/// </summary>
public void TestTagMatchingSingleSpec()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dataSet = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3";
    const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";
    const int scanNum = 4533;
    const int minTagLength = 8;

    // Parse sequence tags
    var tagFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".seqtag");

    if (!File.Exists(tagFileName))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFileName);
    }

    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);
    var tagParser = new SequenceTagParser(tagFileName, minTagLength);

    foreach (var tag in tagParser.GetSequenceTags(scanNum))
    {
        var proteinNames = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence)
            .Select(index => fastaDb.GetProteinName(index))
            .ToArray();

        // Tags without any database hit are not reported.
        if (proteinNames.Length == 0)
        {
            continue;
        }

        var proteinColumn = string.Join("\t", proteinNames);
        Console.WriteLine("{0}\t{1}\t{2}\t{3}", tag.Sequence, tag.IsPrefix, tag.FlankingMass, proteinColumn);
    }
}
/// <summary>
/// Finds sequence tags in a single spectrum, matches them against the FASTA database, groups the
/// matches by protein, and prints each matched tag together with its flanking masses and the
/// difference between those masses and the masses of the protein's own N-/C-terminal flanks.
/// </summary>
/// <remarks>
/// The test is ignored when the raw file or the FASTA file does not exist, or when the requested
/// scan is not a <c>ProductSpectrum</c>.
/// NOTE(review): <c>rawFilePath</c> is an empty string, and <c>File.Exists("")</c> is always false,
/// so this test currently always ignores itself until a real path is filled in.
/// </remarks>
public void TestGetProteinsWithTagMatchingSingleSpec()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    // dataSet and minTagLength are only referenced by the commented-out .seqtag alternative below
    const string dataSet = @"H:\Research\Lewy\raw\Lewy_intact_07";
    // const int scanNum = 5158;
    const int minTagLength = 7;
    const int minNumTagMatches = 1;
    var aminoAcidSet = AminoAcidSet.GetStandardAminoAcidSet();

    const int scanNum = 2;

    // Parse sequence tags
    //const string tagFileName = dataSet + ".seqtag"; //"_MinLength3.seqtag"; //Path.ChangeExtension(dataSet, ".seqtag");

    const string rawFilePath = "";
    const string fastaFilePath = @"H:\Research\Lewy\ID_004858_0EE8CF61.fasta";

    if (!File.Exists(rawFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
    }

    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);

    //var tagParser = new SequenceTagParser(tagFileName, minTagLength);
    //var tags = tagParser.GetSequenceTags(scanNum);

    var run = PbfLcMsRun.GetLcMsRun(rawFilePath);
    var spec = run.GetSpectrum(scanNum) as ProductSpectrum;
    if (spec == null)
    {
        // The "as" cast yields null when the scan is missing or is not a product spectrum;
        // stop here with a clear message instead of passing null to the tag finder.
        Assert.Ignore(@"Skipping test {0} since scan {1} is not a product spectrum", methodName, scanNum);
    }

    var tagFinder = new SequenceTagFinder(spec, new Tolerance(5));
    var tags = tagFinder.GetAllSequenceTagString();

    // Group every database hit by the name of the protein it falls in
    var proteinsToTags = new Dictionary<string, IList<MatchedTag>>();
    foreach (var tag in tags)
    {
        var matchedIndices = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence).ToArray();
        foreach (var index in matchedIndices)
        {
            var protein = fastaDb.GetProteinName(index);

            // Zero-based position: StartIndex is used below as a zero-based offset into the
            // protein sequence (Substring / GetMass), so a one-based position would shift
            // every reported tag and flank mass by one residue.
            var startIndex = fastaDb.GetZeroBasedPositionInProtein(index);
            var matchedTag = new MatchedTag(tag, startIndex, 0.0);

            IList<MatchedTag> existingTags;
            if (proteinsToTags.TryGetValue(protein, out existingTags))
            {
                existingTags.Add(matchedTag);
            }
            else
            {
                proteinsToTags.Add(protein, new List<MatchedTag> { matchedTag });
            }
        }
    }

    // Proteins are visited from most to fewest tag matches, so the first protein below the
    // threshold ends the report.
    foreach (var entry in proteinsToTags.OrderByDescending(e => e.Value.Count))
    {
        if (entry.Value.Count < minNumTagMatches)
        {
            break;
        }

        var proteinName = entry.Key;
        var proteinSequence = fastaDb.GetProteinSequence(proteinName);
        var protein = new Sequence(proteinSequence, aminoAcidSet);
        Console.WriteLine(proteinName + "\t" + entry.Value.Count);

        foreach (var matchedTag in entry.Value)
        {
            var seq = proteinSequence.Substring(matchedTag.StartIndex, matchedTag.EndIndex - matchedTag.StartIndex);

            // Masses of the protein regions before and after the matched tag
            var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
            var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);

            Console.WriteLine("\t{0} ({1})\t{2}\t{3} ({4})\t{5}\t{6}\t{7}",
                matchedTag.NTermFlankingMass, (matchedTag.NTermFlankingMass - nTermMass),
                seq,
                matchedTag.CTermFlankingMass, (matchedTag.CTermFlankingMass - cTermMass),
                matchedTag.StartIndex,
                matchedTag.IsNTermFlankingMassReliable, matchedTag.IsCTermFlankingMassReliable);
        }
    }
}
/// <summary>
/// Prints a tab-separated target list (peptide, formula, protein) for the annotations enumerated
/// from the spiked-in peptide FASTA database, followed by the number of targets written.
/// </summary>
/// <remarks>
/// The test is ignored when the FASTA file does not exist.
/// </remarks>
public void CreateTargetList()
{
    var testName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(testName);

    const string databaseFilePath = @"D:\Research\Data\IPRG2014\database\SpikedInPeptides.fasta";
    if (!File.Exists(databaseFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, databaseFilePath);
    }

    var database = new FastaDatabase(databaseFilePath);
    database.Read();
    var indexedDatabase = new IndexedDatabase(database);
    var aaSet = new AminoAcidSet(Modification.Carbamidomethylation);

    Console.WriteLine("Peptide\tFormula\tProtein");

    var targetCount = 0;
    var annotatedEntries = indexedDatabase.AnnotationsAndOffsets(6, 30, 1, 1, Enzyme.Trypsin);
    foreach (var entry in annotatedEntries)
    {
        var annotation = entry.Annotation;

        // Drop the first two and the last two characters of the annotation to get the peptide
        var peptide = annotation.Substring(2, annotation.Length - 4);

        // Formula of the peptide composition plus one water
        var formula = (aaSet.GetComposition(peptide) + Composition.H2O).ToPlainString();
        var proteinName = database.GetProteinName(entry.Offset);

        Console.WriteLine("{0}\t{1}\t{2}", peptide, formula, proteinName);
        targetCount++;
    }

    Console.WriteLine("NumTargets: {0}", targetCount);
}