/// <summary>
/// Reads the result file named by <c>_fileName</c> and builds one <see cref="SpectrumMatch"/> for the first
/// occurrence of every peptide whose Q-value does not exceed <c>QValueThreshold</c>.
/// </summary>
/// <returns>
/// The accepted matches in row order; an empty list when the <c>TopDownPeptideHeader</c> column is absent.
/// </returns>
public IList<SpectrumMatch> Read()
{
    var matches = new List<SpectrumMatch>();
    var parser = new TsvFileParser(_fileName);
    var chargeColumn = parser.GetData(PrecursorChargeHeader);
    var scanColumn = parser.GetData(ScanHeader);
    var peptideColumn = parser.GetData(TopDownPeptideHeader);

    // Without a peptide column there is nothing to read.
    if (peptideColumn == null)
    {
        return matches;
    }

    const double maxQValue = QValueThreshold;
    var seenPeptides = new HashSet<string>();
    var qValueColumn = parser.GetData(QValueHeader);
    var aminoAcids = new AminoAcidSet();

    for (var row = 0; row < peptideColumn.Count; row++)
    {
        var peptide = peptideColumn[row];
        if (Convert.ToDouble(qValueColumn[row]) > maxQValue)
        {
            continue;
        }

        // Add returns false for a peptide that was already accepted, so only its first row is kept.
        if (!seenPeptides.Add(peptide))
        {
            continue;
        }

        var scanNum = Convert.ToInt32(scanColumn[row]);
        var precursorCharge = Convert.ToInt32(chargeColumn[row]);
        var sequence = new Sequence(peptide, aminoAcids);
        matches.Add(new SpectrumMatch(sequence, _lcms, scanNum, precursorCharge, _decoy));
    }

    return matches;
}
/// <summary>
/// Checks, for every row of the TMT10 MS-GF+ result file, that the composition of the parsed peptide
/// plus one water equals the composition parsed from the "Formula" column.
/// The test is ignored when the result file is not present.
/// </summary>
public void TestReadingTmtResultFile()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string filePath = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSGFPlusResultTMT10.tsv";
    if (!File.Exists(filePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, filePath);
    }

    var parser = new TsvFileParser(filePath);
    var peptideColumn = parser.GetData("Peptide");
    var formulaColumn = parser.GetData("Formula");
    Assert.True(peptideColumn.Count == formulaColumn.Count);

    var parsedPeptides = peptideColumn.Select(p => Sequence.GetSequenceFromMsGfPlusPeptideStr(p)).ToList();
    var parsedFormulae = formulaColumn.Select(f => Composition.Parse(f)).ToList();
    Assert.True(parsedPeptides.Count == parsedFormulae.Count);

    for (var row = 0; row < parsedPeptides.Count; row++)
    {
        var fromPeptide = parsedPeptides[row].Composition + Composition.H2O;
        var fromFormula = parsedFormulae[row];
        Assert.True(fromPeptide.Equals(fromFormula));
    }
}
/// <summary>
/// Checks, for every row of MSGFPlusResultTMT10.tsv in the default test-file folder, that the composition
/// of the parsed peptide plus one water equals the composition parsed from the "Formula" column.
/// The test is ignored when the result file is not present.
/// </summary>
public void TestReadingTmtResultFile()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    var filePath = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, "MSGFPlusResultTMT10.tsv");
    if (!File.Exists(filePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, filePath);
    }

    var parser = new TsvFileParser(filePath);
    var peptideColumn = parser.GetData("Peptide");
    var formulaColumn = parser.GetData("Formula");
    Assert.True(peptideColumn.Count == formulaColumn.Count);

    var parsedPeptides = peptideColumn.Select(p => Sequence.GetSequenceFromMsGfPlusPeptideStr(p)).ToList();
    var parsedFormulae = formulaColumn.Select(f => Composition.Parse(f)).ToList();
    Assert.True(parsedPeptides.Count == parsedFormulae.Count);

    var row = 0;
    while (row < parsedPeptides.Count)
    {
        var withWater = parsedPeptides[row].Composition + Composition.H2O;
        Assert.True(withWater.Equals(parsedFormulae[row]));
        row++;
    }
}
/// <summary>
/// Loads sequence tags from a tab-separated tag file into <c>_scanToTags</c>.
/// Tags shorter than <c>_minTagLength</c> are dropped, and a scan that already holds
/// <c>_numTagsPerScan</c> tags accepts no more.
/// </summary>
/// <param name="tagFilePath">Path of the tag file; the columns ScanNum, SequenceTag, IsPrefix and FlankingMass are read.</param>
private void Parse(string tagFilePath)
{
    var parser = new TsvFileParser(tagFilePath);
    var scanNums = parser.GetData("ScanNum").Select(s => Convert.ToInt32(s)).ToArray();
    var tagStrings = parser.GetData("SequenceTag").ToArray();
    var prefixFlags = parser.GetData("IsPrefix").Select(s => s.Equals("1")).ToArray();
    var flankingMasses = parser.GetData("FlankingMass").Select(s => Convert.ToDouble(s)).ToArray();

    for (var row = 0; row < parser.NumData; row++)
    {
        var tagString = tagStrings[row];
        if (tagString.Length < _minTagLength)
        {
            continue;
        }

        var scanNum = scanNums[row];
        var tag = new SequenceTag.SequenceTag(scanNum, tagString, prefixFlags[row], flankingMasses[row]);

        IList<SequenceTag.SequenceTag> tagsForScan;
        if (!_scanToTags.TryGetValue(scanNum, out tagsForScan))
        {
            // First tag seen for this scan: start its list.
            _scanToTags.Add(scanNum, new List<SequenceTag.SequenceTag> { tag });
            continue;
        }

        if (tagsForScan.Count < _numTagsPerScan)
        {
            tagsForScan.Add(tag);
        }
    }
}
/// <summary>
/// For every identification with QValue of at most 0.01 and a non-empty modification string, prints the scan,
/// the mass of the sequence composition plus water, the summed modification mass, the matched-fragment
/// count, the sequence and the modification string. The test is ignored when the result file is missing.
/// </summary>
public void TestPredictPTMfromMs1ft()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string resultFilePath = @"\\protoapps\UserData\Jungkap\FeatureFinding\ProMex_v1.1\test\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv";
    if (!File.Exists(resultFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath);
    }

    // const string ms1ftFilePath = @"\\protoapps\UserData\Jungkap\FeatureFinding\ProMex_v1.1\test\QC_Shew_Intact_26Sep14_Bane_C2Column3.ms1ft";

    var parser = new TsvFileParser(resultFilePath);
    var sequences = parser.GetData("Sequence");
    var modifications = parser.GetData("Modifications");
    var scanNums = parser.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray();
    var qValues = parser.GetData("QValue").Select(s => Convert.ToDouble(s)).ToArray();
    var matchedFragments = parser.GetData("#MatchedFragments");
    var aminoAcids = new AminoAcidSet();

    // Collected (scan, sequence mass, modification mass) triples; only filled, not read, by this test.
    var ptmList = new List<Tuple<int, double, double>>();

    for (var row = 0; row < parser.NumData; row++)
    {
        if (qValues[row] > 0.01)
        {
            continue;
        }

        var sequence = new Sequence(sequences[row], aminoAcids);
        var sequenceComposition = sequence.Composition + Composition.H2O;
        var modComposition = Composition.Zero;

        var modsText = modifications[row];
        if (modsText.Length == 0)
        {
            continue;
        }

        foreach (var modText in modsText.Split(','))
        {
            if (modText.Length == 0)
            {
                continue;
            }

            // The modification name is the first whitespace-delimited token of each entry.
            var modName = modText.Split()[0];
            modComposition += Modification.Get(modName).Composition;
        }

        Console.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}", scanNums[row], sequenceComposition.Mass, modComposition.Mass, matchedFragments[row], sequences[row], modsText);
        ptmList.Add(new Tuple<int, double, double>(scanNums[row], sequenceComposition.Mass, modComposition.Mass));
    }

    //var featureParser = new TsvFileParser(ms1ftFilePath);
    //var minScan = featureParser.GetData("MinScan").Select(s => Convert.ToInt32(s)).ToArray();
    //var maxScan = featureParser.GetData("MaxScan").Select(s => Convert.ToInt32(s)).ToArray();
    //var monoMass = featureParser.GetData("MonoMass").Select(s => Convert.ToDouble(s)).ToArray();
}
/// <summary>
/// Scores every peptide of the result file, and a reversed-then-mutated decoy of it, with the DIA rank
/// scorer and writes the two scores per row to a tab-separated "Target / Decoy" file.
/// The test is ignored when the raw file or the result file is missing.
/// </summary>
public void DiaRankScore()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dataFile = @"\\protoapps\UserData\Wilkins\BottomUp\HCD_QCShew\raw\QC_Shew_13_04_A_17Feb14_Samwise_13-07-28.raw";
    const string tsvFile = @"\\protoapps\UserData\Wilkins\BottomUp\HCD_QCShew\tsv\QC_Shew_13_04_A_17Feb14_Samwise_13-07-28.tsv";

    foreach (var requiredFile in new[] { dataFile, tsvFile })
    {
        if (!File.Exists(requiredFile))
        {
            Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, requiredFile);
        }
    }

    var parser = new TsvFileParser(tsvFile);
    var peptideColumn = parser.GetData("Peptide");
    var chargeColumn = parser.GetData("Charge");
    var scanColumn = parser.GetData("ScanNum");

    var lcms = InMemoryLcMsRun.GetLcMsRun(dataFile, 0, 0);
    var rankScorer = new DiaRankScore(@"C:\Users\wilk011\Documents\DataFiles\TestFolder\HCD_QExactive_Tryp.txt");

    using (var outFile = new StreamWriter(@"C:\Users\wilk011\Documents\DataFiles\TestFolder\HCD_QCShew_Score_2.txt"))
    {
        outFile.WriteLine("Target\tDecoy");

        for (var row = 0; row < peptideColumn.Count; row++)
        {
            var peptideText = peptideColumn[row];
            var charge = Convert.ToInt32(chargeColumn[row]);
            var scan = Convert.ToInt32(scanColumn[row]);

            var target = Sequence.GetSequenceFromMsGfPlusPeptideStr(peptideText);

            // Build the decoy: reverse the residues, flatten to text, mutate, then parse again.
            var decoy = Sequence.GetSequenceFromMsGfPlusPeptideStr(peptideText);
            decoy.Reverse();
            var decoyText = decoy.Aggregate("", (soFar, residue) => soFar + residue);
            decoyText = SimpleStringProcessing.Mutate(decoyText, target.Count / 2);
            decoy = Sequence.GetSequenceFromMsGfPlusPeptideStr(decoyText);

            var targetScore = rankScorer.GetScore(target, charge, scan, lcms);
            var decoyScore = rankScorer.GetScore(decoy, charge, scan, lcms);
            outFile.WriteLine("{0}\t{1}", targetScore, decoyScore);
        }
    }
}
/// <summary>
/// Prints, for every MS-GF+ match at 1% PSM FDR, the scan, peptide, SpecEValue and the score found for the
/// same "scan:peptide" key in the IC result file (-1 when the IC file has no such key).
/// The test is ignored when either input file is missing.
/// </summary>
public void TestInitialScoring()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string icResultPath = @"C:\cygwin\home\kims336\Data\QCShewQE\Ic_NTT2_03_NoMod_NoRescoring\QC_Shew_13_04_A_17Feb14_Samwise_13-07-28_IcTarget.tsv";
    if (!File.Exists(icResultPath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, icResultPath);
    }

    var icParser = new TsvFileParser(icResultPath);
    var icScans = icParser.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray();
    var icPeptides = icParser.GetData("Sequence");
    var icScores = icParser.GetData("Score").Select(s => Convert.ToInt32(s)).ToArray();

    // Key: "<scan>:<peptide>", value: IC score.
    var icScoreByKey = new Dictionary<string, int>();
    for (var row = 0; row < icParser.NumData; row++)
    {
        var key = icScans[row] + ":" + icPeptides[row];
        icScoreByKey.Add(key, icScores[row]);
    }

    const string msgfPlusResultPath = @"C:\cygwin\home\kims336\Data\QCShewQE\NoMod.tsv";
    if (!File.Exists(msgfPlusResultPath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, msgfPlusResultPath);
    }

    var msgfPlusResults = new MsGfResults(msgfPlusResultPath);
    var matches = msgfPlusResults.GetMatchesAtPsmFdr(0.01);

    Console.WriteLine("ScanNum\tPeptide\tSpecEValue\tIcScore");
    foreach (var match in matches)
    {
        var scanNum = match.ScanNum;
        var peptide = match.Peptide;
        var specEValue = match.SpecEValue;

        int foundScore;
        var icScore = icScoreByKey.TryGetValue(scanNum + ":" + peptide, out foundScore) ? foundScore : -1;

        Console.WriteLine("{0}\t{1}\t{2}\t{3}", scanNum, peptide, specEValue, icScore);
    }
}
/// <summary>
/// Prints, for every peptide identified in both the DDA and the DIA summary at a peptide Q-value of 0.01,
/// the scan number of its first row in each result and the retention time of those scans.
/// The test is ignored when any of the four input files is missing, matching the other tests that
/// depend on hard-coded data locations.
/// </summary>
public void CompareRtFusion()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    // Fusion
    const string qeDdaResult = @"D:\Research\Data\UW\Fusion\DDA_Summary.tsv";
    const string qeDiaResult = @"D:\Research\Data\UW\Fusion\DIA_Summary.tsv";
    const string specFileDda = @"D:\Research\Data\UW\Fusion\WT_D_DDA_130412065618.raw";
    const string specFileDia = @"D:\Research\Data\UW\Fusion\WT_D_DIA_130412091220.raw";

    // Skip instead of failing with an I/O error on machines that do not have the data.
    foreach (var requiredFile in new[] { specFileDda, specFileDia, qeDdaResult, qeDiaResult })
    {
        if (!System.IO.File.Exists(requiredFile))
        {
            Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, requiredFile);
        }
    }

    var ddaReader = new XCaliburReader(specFileDda);
    var diaReader = new XCaliburReader(specFileDia);

    var result1 = new TsvFileParser(qeDdaResult);
    var result2 = new TsvFileParser(qeDiaResult);

    const double pepQValueThreshold = 0.01;
    var vennDiagram = new VennDiagram<string>(result1.GetPeptides(pepQValueThreshold), result2.GetPeptides(pepQValueThreshold));
    var intersectionPeptides = vennDiagram.Intersection;

    var result1Peptides = result1.GetData("Peptide");
    var result1ScanNums = result1.GetData("ScanNum");
    var result2Peptides = result2.GetData("Peptide");
    var result2ScanNums = result2.GetData("ScanNum");

    Console.WriteLine("Peptide\tScanNum1\tScanNum2\tRt1\tRt2");
    foreach (var peptide in intersectionPeptides)
    {
        // IndexOf yields the first row that lists the peptide in each result.
        var index1 = result1Peptides.IndexOf(peptide);
        var index2 = result2Peptides.IndexOf(peptide);

        var scanNum1 = Convert.ToInt32(result1ScanNums[index1]);
        var scanNum2 = Convert.ToInt32(result2ScanNums[index2]);

        Console.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", peptide.Replace("C+57.021", "C"), scanNum1, scanNum2, ddaReader.RtFromScanNum(scanNum1), diaReader.RtFromScanNum(scanNum2));
    }
}
/// <summary>
/// Loads features from an ms1ft file into <c>_lcMsChargeMap</c>, keeping only rows whose LikelihoodRatio
/// is not below <c>_minLikelihoodRatio</c>, then builds the mass-to-scan-number map.
/// </summary>
/// <param name="ms1FtFileName">Path of the tab-separated feature file.</param>
private void Read(string ms1FtFileName)
{
    var parser = new TsvFileParser(ms1FtFileName);
    var monoMasses = parser.GetData("MonoMass").Select(s => Convert.ToDouble(s)).ToArray();
    var minScans = parser.GetData("MinScan").Select(s => Convert.ToInt32(s)).ToArray();
    var maxScans = parser.GetData("MaxScan").Select(s => Convert.ToInt32(s)).ToArray();
    var repScans = parser.GetData("RepScan").Select(s => Convert.ToInt32(s)).ToArray();
    var minCharges = parser.GetData("MinCharge").Select(s => Convert.ToInt32(s)).ToArray();
    var maxCharges = parser.GetData("MaxCharge").Select(s => Convert.ToInt32(s)).ToArray();
    var likelihoodRatios = parser.GetData("LikelihoodRatio").Select(s => Convert.ToDouble(s)).ToArray();

    var totalFeatures = monoMasses.Length;
    var loadedFeatures = 0;

    for (var row = 0; row < totalFeatures; row++)
    {
        //if (flagArray[i] == 0 && probArray[i] < _minProbability) continue;
        if (likelihoodRatios[row] < _minLikelihoodRatio)
        {
            continue;
        }

        loadedFeatures++;
        _lcMsChargeMap.SetMatches(monoMasses[row], minScans[row], maxScans[row], repScans[row], minCharges[row], maxCharges[row]);
    }

    // NOTE: The DMS Analysis Manager looks for this statistic; do not change it
    Console.Write(@"{0}/{1} features loaded...", loadedFeatures, totalFeatures);

    _lcMsChargeMap.CreateMassToScanNumMap();
}
/// <summary>
/// Groups MS2 scan numbers by composition string for the leading rows of the result file, stopping at the
/// first row whose QValue exceeds 0.01, and prints the number of distinct compositions.
/// The test is ignored when the result file is missing.
/// </summary>
public void TestClusterCentricSearch()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string pfResultFilePath = @"H:\Research\QCShew_TopDown\Production\M1_V4_JP_Len500\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv";
    if (!File.Exists(pfResultFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, pfResultFilePath);
    }

    var tsvReader = new TsvFileParser(pfResultFilePath);
    var ms2Scans = tsvReader.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray();
    var compositions = tsvReader.GetData("Composition").ToArray();
    var qValues = tsvReader.GetData("QValue").Select(s => Convert.ToDouble(s)).ToArray();

    var scansByComposition = new Dictionary<string, IList<int>>();
    for (var row = 0; row < qValues.Length; row++)
    {
        // The loop ends at the first row above the threshold rather than skipping it.
        if (qValues[row] > 0.01)
        {
            break;
        }

        var composition = compositions[row];
        IList<int> scansForComposition;
        if (!scansByComposition.TryGetValue(composition, out scansForComposition))
        {
            scansForComposition = new List<int>();
            scansByComposition.Add(composition, scansForComposition);
        }

        scansForComposition.Add(ms2Scans[row]);
    }

    Console.Write("NumCompositions: {0}", scansByComposition.Keys.Count);

    //const string featureFilePath = @"H:\Research\QCShew_TopDown\Production\M1_V4_JP_Len500\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv";
}
/// <summary>
/// Loads features from a comma-separated isos file into <c>_lcMsMatchMap</c>. A row is skipped when a "fit"
/// column exists and its value exceeds <c>_fitScoreThreshold</c>, or when its charge is 1 or less.
/// Afterwards the sequence-mass-to-MS2-scans map is built over the observed mass range.
/// </summary>
/// <param name="isosFileName">Path of the isos file.</param>
private void Read(string isosFileName)
{
    var parser = new TsvFileParser(isosFileName, ',');
    var monoMasses = parser.GetData("monoisotopic_mw").Select(s => Convert.ToDouble(s)).ToArray();
    var scanNums = parser.GetData("scan_num").Select(s => Convert.ToInt32(s)).ToArray();
    var charges = parser.GetData("charge").Select(s => Convert.ToInt32(s)).ToArray();

    // The "fit" column is optional; when it is absent no fit filtering is applied.
    var fitColumn = parser.GetData("fit");
    double[] fitScores = null;
    if (fitColumn != null)
    {
        fitScores = fitColumn.Select(s => Convert.ToDouble(s)).ToArray();
    }

    var loadedFeatures = 0;
    var minMass = double.MaxValue;
    var maxMass = 0.0;

    for (var row = 0; row < monoMasses.Length; row++)
    {
        if (fitScores != null && fitScores[row] > _fitScoreThreshold)
        {
            continue;
        }

        if (charges[row] <= 1)
        {
            continue;
        }

        loadedFeatures++;

        var scan = scanNums[row];
        var monoMass = monoMasses[row];

        if (minMass > monoMass)
        {
            minMass = monoMass;
        }

        if (maxMass < monoMass)
        {
            maxMass = monoMass;
        }

        var minScan = _run.GetPrevScanNum(scan, 1);
        var maxScan = _run.GetNextScanNum(scan, 1);
        _lcMsMatchMap.SetMatches(monoMass, minScan, maxScan);
    }

    Console.Write(@"{0}/{1} features loaded...", loadedFeatures, monoMasses.Length);
    _lcMsMatchMap.CreateSequenceMassToMs2ScansMap(_run, _massTolerance, minMass, maxMass);
}
/// <summary>
/// Asserts, for every row of the rescored result file, that the sequence composition plus water plus the
/// compositions of all listed modifications equals the value of the "Composition" column.
/// The test is ignored when the result file is missing.
/// </summary>
public void ValidateIcResultsWithModifications()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    const string resultFilePath = @"C:\cygwin\home\kims336\Data\TopDownJia\raw\Synocho_D1_1_Rescored.tsv";
    if (!File.Exists(resultFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath);
    }

    var parser = new TsvFileParser(resultFilePath);
    var sequences = parser.GetData("Sequence");
    var modifications = parser.GetData("Modifications");
    var expectedCompositions = parser.GetData("Composition").Select(c => Composition.Parse(c)).ToArray();

    // Not read below; the eager conversion still fails on a missing or non-numeric ScanNum column.
    var scanNums = parser.GetData("ScanNum").Select(s => Convert.ToInt32(s)).ToArray();

    var aminoAcids = new AminoAcidSet();
    for (var row = 0; row < parser.NumData; row++)
    {
        var sequenceComposition = aminoAcids.GetComposition(sequences[row]) + Composition.H2O;
        var modComposition = Composition.Zero;

        // Drop the first and the last character of the modification field before splitting on commas.
        var modField = modifications[row];
        var modsText = modField.Substring(1, modField.Length - 2);

        foreach (var modText in modsText.Split(',').Where(m => m.Length != 0))
        {
            var modName = modText.Split()[0];
            modComposition += Modification.Get(modName).Composition;
        }

        var actualComposition = sequenceComposition + modComposition;
        Assert.True(actualComposition.Equals(expectedCompositions[row]));
    }
}
/// <summary>
/// Rescores every row of an IC result file with <c>_topDownScorer</c> and writes the original columns,
/// with the "Modifications" column replaced by the bracketed modifications reported by the scorer,
/// followed by the new scores.
/// </summary>
/// <param name="icResultFilePath">Tab-separated IC result file; Sequence, ScanNum, Charge and Composition are read.</param>
/// <param name="outputFilePath">File that receives the rescored rows.</param>
/// <remarks>
/// Rows for which the scorer returns <c>null</c> are skipped, as the MSAlign rescoring method does;
/// previously such a row raised a <see cref="NullReferenceException"/> part-way through writing it.
/// </remarks>
private void Rescore(string icResultFilePath, string outputFilePath)
{
    var parser = new TsvFileParser(icResultFilePath);
    var sequences = parser.GetData("Sequence");
    var scanNums = parser.GetData("ScanNum").Select(s => Convert.ToInt32(s)).ToArray();
    var charges = parser.GetData("Charge").Select(c => Convert.ToInt32(c)).ToArray();
    var compositions = parser.GetData("Composition").Select(Composition.Parse).ToArray();

    var headers = parser.GetHeaders();
    var modIndex = headers.IndexOf("Modifications");
    var rows = parser.GetRows();

    using (var writer = new StreamWriter(outputFilePath))
    {
        writer.WriteLine("{0}\t{1}", string.Join("\t", headers), IcScores.GetScoreNames());

        for (var i = 0; i < parser.NumData; i++)
        {
            var scores = _topDownScorer.GetScores(AminoAcid.ProteinNTerm, sequences[i], AminoAcid.ProteinCTerm, compositions[i], charges[i], scanNums[i]);
            if (scores == null)
            {
                // Nothing to report for this row; skip it before any part of it has been written.
                continue;
            }

            var token = rows[i].Split('\t');
            for (var j = 0; j < token.Length; j++)
            {
                if (j != modIndex)
                {
                    writer.Write(token[j] + "\t");
                }
                else
                {
                    writer.Write("[" + scores.Modifications + "]" + "\t");
                }
            }

            writer.WriteLine(scores);
        }
    }
}
/// <summary>
/// Rescores the unmodified identifications of an MSAlign result file with <c>_topDownScorer</c> and writes
/// each original row followed by its new scores.
/// </summary>
/// <param name="msAlignFilePath">Tab-separated MSAlign result file; Peptide, Scan(s) and Charge are read.</param>
/// <param name="outputFilePath">File that receives the rescored rows.</param>
/// <remarks>
/// A row is skipped when no sequence can be extracted between the dots of its peptide string, when the
/// sequence contains "(", or when the scorer returns <c>null</c>.
/// </remarks>
private void Rescore(string msAlignFilePath, string outputFilePath)
{
    var parser = new TsvFileParser(msAlignFilePath);
    var peptideColumn = parser.GetData("Peptide");
    var scanNums = parser.GetData("Scan(s)").Select(s => Convert.ToInt32(s)).ToArray();
    var charges = parser.GetData("Charge").Select(c => Convert.ToInt32(c)).ToArray();
    var rows = parser.GetRows();
    var headers = parser.GetHeaders();

    using (var writer = new StreamWriter(outputFilePath))
    {
        writer.WriteLine("{0}\t{1}", string.Join("\t", headers), IcScores.GetScoreNames());

        for (var rowIndex = 0; rowIndex < parser.NumData; rowIndex++)
        {
            var seqStr = SimpleStringProcessing.GetStringBetweenDots(peptideColumn[rowIndex]);

            //TODO: currently ignore ids with modifications
            if (seqStr == null)
            {
                continue;
            }

            if (seqStr.Contains("("))
            {
                continue;
            }

            var composition = AASet.GetComposition(seqStr);

            //var sequence = new Sequence(seqStr, AASet);
            //if (sequence == null)
            //{
            //    Console.WriteLine("Ignore illegal sequence: {0}", seqStr);
            //    continue;
            //}

            var scores = _topDownScorer.GetScores(AminoAcid.ProteinNTerm, seqStr, AminoAcid.ProteinCTerm, composition, charges[rowIndex], scanNums[rowIndex]);
            if (scores != null)
            {
                writer.WriteLine("{0}\t{1}", rows[rowIndex], scores);
            }
        }
    }
}
/// <summary>
/// Writes one tab-separated line per identification: scan, sequence, modifications, mass, protein name,
/// protein description, start, end, score and Q-value.
/// </summary>
/// <param name="writer">Destination of the merged rows.</param>
/// <param name="parser">Parsed result file; the score is taken from "#MatchedFragments", or "Score" when that column is absent.</param>
/// <param name="fastaDb">Database used to look up the description of each protein name.</param>
/// <remarks>
/// Every column is fetched once before the loop instead of once per row and per field.
/// When the file has no "QValue" column, "0" is written as the Q-value.
/// </remarks>
private void OutputMergedResult(TextWriter writer, TsvFileParser parser, FastaDatabase fastaDb)
{
    var scoreColumn = parser.GetData("#MatchedFragments") ?? parser.GetData("Score");
    var qValColumn = parser.GetData("QValue");

    // Loop-invariant column lookups, hoisted out of the per-row loop.
    var sequenceColumn = parser.GetData("Sequence");
    var scanColumn = parser.GetData("Scan");
    var massColumn = parser.GetData("Mass");
    var proteinNameColumn = parser.GetData("ProteinName");
    var startColumn = parser.GetData("Start");
    var endColumn = parser.GetData("End");
    var modColumn = parser.GetData("Modifications");

    for (var i = 0; i < parser.NumData; i++)
    {
        var sequence = sequenceColumn[i];
        var scanNum = int.Parse(scanColumn[i]);
        var mass = double.Parse(massColumn[i]);
        var protName = proteinNameColumn[i];
        var protDesc = fastaDb.GetProteinDescription(protName);
        var firstResId = int.Parse(startColumn[i]);
        var lastResId = int.Parse(endColumn[i]);
        var score = double.Parse(scoreColumn[i]);
        var mod = modColumn[i];
        var qvalue = (qValColumn != null) ? qValColumn[i] : "0";

        writer.Write(scanNum);
        writer.Write("\t");
        writer.Write(sequence);
        writer.Write("\t");
        writer.Write(mod);
        writer.Write("\t");
        writer.Write(mass);
        writer.Write("\t");
        writer.Write(protName);
        writer.Write("\t");
        writer.Write(protDesc);
        writer.Write("\t");
        writer.Write(firstResId);
        writer.Write("\t");
        writer.Write(lastResId);
        writer.Write("\t");
        writer.Write(score);
        writer.Write("\t");
        writer.Write(qvalue);
        writer.Write("\n");
    }
}
/// <summary>
/// Groups MS2 scan numbers by composition string for the leading rows of the excerpt result file, stopping
/// at the first row whose QValue exceeds the threshold, and asserts the number of distinct compositions.
/// </summary>
/// <param name="qValueThreshold">Largest QValue a row may have before processing stops.</param>
/// <param name="expectedNumCompositions">Number of distinct compositions the test expects.</param>
public void TestClusterCentricSearch(double qValueThreshold, int expectedNumCompositions)
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    var resultFilePath = Path.Combine(Utils.DEFAULT_SPEC_FILES_FOLDER, "QC_Shew_Intact_26Sep14_Bane_C2Column3_Excerpt_IcTda.tsv");
    var resultFile = Utils.GetTestFile(methodName, resultFilePath);

    var tsvReader = new TsvFileParser(resultFile.FullName);
    var ms2Scans = tsvReader.GetData("Scan").Select(s => Convert.ToInt32((string)s)).ToArray();
    var compositions = tsvReader.GetData("Composition").ToArray();
    var qValues = tsvReader.GetData("QValue").Select(s => Convert.ToDouble(s)).ToArray();

    var scansByComposition = new Dictionary<string, IList<int>>();
    for (var row = 0; row < qValues.Length; row++)
    {
        // The loop ends at the first row above the threshold rather than skipping it.
        if (qValues[row] > qValueThreshold)
        {
            break;
        }

        var composition = compositions[row];
        IList<int> scansForComposition;
        if (!scansByComposition.TryGetValue(composition, out scansForComposition))
        {
            scansForComposition = new List<int>();
            scansByComposition.Add(composition, scansForComposition);
        }

        scansForComposition.Add(ms2Scans[row]);
    }

    var numCompositions = scansByComposition.Keys.Count;
    Console.Write("NumCompositions: {0}", numCompositions);
    Assert.AreEqual(expectedNumCompositions, numCompositions);
}
/// <summary>
/// Compares the peptides of the IPA summary (peptide Q-value 0.005, with "C+57.021" rewritten to "C")
/// against the sequences of all "*DIA*IcTarget.tsv" files in the IC result folder. Peptides found only by
/// IPA are printed, followed by the counts of shared and IPA-only peptides.
/// The test is ignored when the result folder or the IPA summary is missing.
/// </summary>
public void CompareIpaIc()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    const string resultDir = @"D:\Research\Data\UW\QExactive\Ic_NTT2_03";
    if (!Directory.Exists(resultDir))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, resultDir);
    }

    var targetPeptides = new HashSet<string>();
    foreach (var icResultFilePath in Directory.GetFiles(resultDir, "*DIA*IcTarget.tsv"))
    {
        var icParser = new TsvFileParser(icResultFilePath);
        foreach (var peptide in icParser.GetData("Sequence"))
        {
            targetPeptides.Add(peptide);
        }
    }

    const string ipaResultPath = @"D:\Research\Data\UW\QExactive\DIA_All_Summary.tsv";
    if (!File.Exists(ipaResultPath))
    {
        // Report the missing file itself; the message previously repeated the method name here.
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, ipaResultPath);
    }

    var parser = new TsvFileParser(ipaResultPath);
    var ipaPeptides = parser.GetPeptides(0.005).Select(p => p.Replace("C+57.021", "C"));

    var ipaOnly = 0;
    var both = 0;
    foreach (var ipaPeptide in ipaPeptides)
    {
        if (targetPeptides.Contains(ipaPeptide))
        {
            ++both;
        }
        else
        {
            ++ipaOnly;
            Console.WriteLine(ipaPeptide);
        }
    }

    Console.WriteLine("Both: {0}, IpaOnly: {1}, Sum: {2}", both, ipaOnly, both + ipaOnly);
}
/// <summary>
/// Reads the bottom-up result file named by <c>_fileName</c> and builds one <see cref="SpectrumMatch"/> for
/// the first occurrence of every peptide whose peptide Q-value does not exceed <c>PepQValueThreshold</c>.
/// When the file has a formula column and the row has a value in it, the formula is passed to the match.
/// </summary>
/// <returns>The accepted matches in row order.</returns>
/// <exception cref="FormatException">
/// The scan, peptide, precursor-charge or peptide Q-value column is missing. Previously only the scan
/// column was checked, so the other cases surfaced as a <see cref="NullReferenceException"/>.
/// </exception>
public IList<SpectrumMatch> Read()
{
    var specMatches = new List<SpectrumMatch>();
    var tsvFile = new TsvFileParser(_fileName);

    var precursorCharges = tsvFile.GetData(PrecursorChargeHeader);
    var scans = tsvFile.GetData(ScanHeader);
    var peptides = tsvFile.GetData(BottomUpPeptideHeader);
    var pepQValues = tsvFile.GetData(PepQValueHeader);

    // Every column dereferenced in the loop below is required.
    if (scans == null || peptides == null || precursorCharges == null || pepQValues == null)
    {
        throw new FormatException("Result file is missing a required column (scan, peptide, precursor charge or peptide Q-value).");
    }

    // The formula column is optional.
    var formulas = tsvFile.GetData(FormulaHeader);

    var peptideSet = new HashSet<string>();
    for (var i = 0; i < peptides.Count; i++)
    {
        if (Convert.ToDouble(pepQValues[i]) > PepQValueThreshold)
        {
            continue;
        }

        // Add returns false for a peptide that was already accepted, so only its first row is kept.
        if (!peptideSet.Add(peptides[i]))
        {
            continue;
        }

        var scanNum = Convert.ToInt32(scans[i]);
        // var spectrum = lcms.GetSpectrum(scanNum);
        // var spec = spectrum as ProductSpectrum;
        // if (spec == null || spec.ActivationMethod != Act) continue;
        var precursorCharge = Convert.ToInt32(precursorCharges[i]);

        if (formulas != null && formulas[i] != null)
        {
            specMatches.Add(new SpectrumMatch(peptides[i], DataFileFormat.IcBottomUp, _lcms, scanNum, precursorCharge, _decoy, formulas[i]));
        }
        else
        {
            specMatches.Add(new SpectrumMatch(peptides[i], DataFileFormat.IcBottomUp, _lcms, scanNum, precursorCharge, _decoy));
        }
    }

    return specMatches;
}
/// <summary>
/// Prints scan, mass, charge, QValue and the MS1 evidence score for every row of the Lewy_intact_01
/// result file in the default test-file folder.
/// The test is ignored when the pbf file or the result file is missing.
/// </summary>
/// <remarks>The four columns are fetched once before the loop instead of once per row.</remarks>
public void TestMs1EvidenceScore()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    var testRawFile = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"TopDown\Lewy_ManyMods\Lewy_intact_01.pbf");
    if (!File.Exists(testRawFile))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, testRawFile);
    }

    var testResultFile = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"TopDown\Lewy_ManyMods\TestOutput\Lewy_intact_01_IcTda.tsv");
    if (!File.Exists(testResultFile))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, testResultFile);
    }

    var run = PbfLcMsRun.GetLcMsRun(testRawFile);
    var tsvParser = new TsvFileParser(testResultFile);
    var featureFinder = new LcMsPeakMatrix(run);

    // Loop-invariant column lookups, hoisted out of the per-row loop.
    var scanColumn = tsvParser.GetData("Scan");
    var chargeColumn = tsvParser.GetData("Charge");
    var massColumn = tsvParser.GetData("Mass");
    var qValueColumn = tsvParser.GetData("QValue");

    for (var i = 0; i < tsvParser.NumData; i++)
    {
        var scan = int.Parse(scanColumn[i]);
        var charge = int.Parse(chargeColumn[i]);
        var mass = double.Parse(massColumn[i]);
        var qvalue = double.Parse(qValueColumn[i]);

        //var targetFeature = new TargetFeature(mass, charge, scan);
        var score = featureFinder.GetMs1EvidenceScore(scan, mass, charge);
        Console.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", scan, mass, charge, qvalue, score);
    }
}
/// <summary>
/// Prints scan, mass, charge, QValue and the MS1 evidence score for every row of the Lewy_intact_01
/// result file on the shared drive.
/// The test is ignored when the pbf file or the result file is missing.
/// </summary>
/// <remarks>The four columns are fetched once before the loop instead of once per row.</remarks>
public void TestMs1EvidenceScore()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string testRawFile = @"\\protoapps\UserData\Jungkap\Lewy\Lewy_intact_01.pbf";
    if (!File.Exists(testRawFile))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, testRawFile);
    }

    const string testResultFile = @"\\protoapps\UserData\Jungkap\Lewy\Lewy_intact_01_IcTda.tsv";
    if (!File.Exists(testResultFile))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, testResultFile);
    }

    var run = PbfLcMsRun.GetLcMsRun(testRawFile);
    var tsvParser = new TsvFileParser(testResultFile);
    var featureFinder = new LcMsPeakMatrix(run);

    // Loop-invariant column lookups, hoisted out of the per-row loop.
    var scanColumn = tsvParser.GetData("Scan");
    var chargeColumn = tsvParser.GetData("Charge");
    var massColumn = tsvParser.GetData("Mass");
    var qValueColumn = tsvParser.GetData("QValue");

    for (var i = 0; i < tsvParser.NumData; i++)
    {
        var scan = int.Parse(scanColumn[i]);
        var charge = int.Parse(chargeColumn[i]);
        var mass = double.Parse(massColumn[i]);
        var qvalue = double.Parse(qValueColumn[i]);

        //var targetFeature = new TargetFeature(mass, charge, scan);
        var score = featureFinder.GetMs1EvidenceScore(scan, mass, charge);
        Console.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", scan, mass, charge, qvalue, score);
    }
}
/// <summary>
/// Reads an MS-GF+ result file and returns at most one match per run of consecutive rows that share a scan
/// number, keeping only matches whose QValue does not exceed <c>FdrCutoff</c>.
/// </summary>
/// <param name="msgfResultPath">Path of the tab-separated MS-GF+ result file.</param>
/// <param name="maxPrsm">Reading stops as soon as the result holds this many matches.</param>
/// <returns>The accepted matches in row order.</returns>
/// <remarks>
/// Only the first row of each run of equal scan numbers is considered; if that row fails the QValue
/// cutoff, the remaining rows of the run are still skipped. The mass is computed as
/// (PrecursorMZ - proton) * charge. Columns are fetched once before the loop instead of once per row.
/// </remarks>
public List<ProteinSpectrumMatch> ReadMsGfPlusResult(string msgfResultPath, int maxPrsm)
{
    var parser = new TsvFileParser(msgfResultPath);
    var prsmList = new List<ProteinSpectrumMatch>();

    // Loop-invariant column lookups, hoisted out of the per-row loop.
    var peptideColumn = parser.GetData("Peptide");
    var scanColumn = parser.GetData("Scan");
    var precursorMzColumn = parser.GetData("PrecursorMZ");
    var proteinColumn = parser.GetData("Protein");
    var scoreColumn = parser.GetData("MSGFScore");
    var chargeColumn = parser.GetData("Charge");
    var qValueColumn = parser.GetData("QValue");

    // These fields are not taken from the file; fixed placeholders are passed to the match.
    const string protDesc = "";
    const int firstResId = 0;
    const int lastResId = 0;

    var prevScanNum = -1;
    for (var i = 0; i < parser.NumData; i++)
    {
        var sequence = peptideColumn[i];
        var scanNum = int.Parse(scanColumn[i]);
        if (prevScanNum == scanNum)
        {
            continue;
        }

        prevScanNum = scanNum;

        var mz = double.Parse(precursorMzColumn[i]);
        var protName = proteinColumn[i];
        var score = double.Parse(scoreColumn[i]);
        var charge = int.Parse(chargeColumn[i]);

        var seq = Sequence.GetSequenceFromMsGfPlusPeptideStr(sequence);
        var sequenceText = GetSequenceText(seq);
        var mass = (mz - Constants.Proton) * charge;

        var fdr = Double.Parse(qValueColumn[i]);
        if (fdr > FdrCutoff)
        {
            continue;
        }

        var prsm = new ProteinSpectrumMatch(sequence, scanNum, mass, charge, protName, protDesc, firstResId, lastResId, score, ProteinSpectrumMatch.SearchTool.MsGfPlus)
        {
            SequenceText = sequenceText,
        };
        prsmList.Add(prsm);

        if (prsmList.Count >= maxPrsm)
        {
            break;
        }
    }

    return prsmList;
}
/// <summary>
/// Builds a Venn diagram of the peptides (peptide Q-value 0.01) of the UMC-based PE-MMR result and the IPA
/// summary, prints the set sizes, and lists the peptides found only in the IPA summary.
/// The test is ignored when either input file is missing, matching the other tests that depend on
/// hard-coded data locations.
/// </summary>
public void GenerateVennDiagramsPeMmr()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    // No PE-MMR
    //const string noPeMmr = @"D:\Research\Data\PEMMR\iTRAQ_N33T34_10ug_100cm_300min_C2_061213.tsv";

    // PE-MMR Scan based FDR
    //const string scanBasedPeMmr = @"D:\Research\Data\PEMMR\NewSpectra\iTRAQ_N33T34_10ug_100cm_300min_C2_061213_MX_PEMMR_UMCID_ScanFDR.tsv";

    // UMC based FDR
    const string umcBasedPeMmr = @"D:\Research\Data\PEMMR\NewSpectra\iTRAQ_N33T34_10ug_100cm_300min_C2_061213_MX_PEMMR_UMCID_UMCFDR.tsv";

    // IPA
    const string ipa = @"D:\Research\Data\PEMMR\Ox\IPA_Summary_TargetOnly.tsv";

    const string resultPath1 = umcBasedPeMmr;
    const string resultPath2 = ipa;

    // Skip instead of failing with an I/O error on machines that do not have the data.
    foreach (var requiredFile in new[] { resultPath1, resultPath2 })
    {
        if (!System.IO.File.Exists(requiredFile))
        {
            Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, requiredFile);
        }
    }

    var result1 = new TsvFileParser(resultPath1);
    var result2 = new TsvFileParser(resultPath2);

    const double pepQValueThreshold = 0.01;
    var vennDiagram = new VennDiagram<string>(result1.GetPeptides(pepQValueThreshold), result2.GetPeptides(pepQValueThreshold));

    // First line: total per set and the overlap; second line: exclusive counts and the overlap.
    Console.WriteLine("{0}\t{1}\t{2}", vennDiagram.Set1Only.Count + vennDiagram.Intersection.Count, vennDiagram.Intersection.Count, vennDiagram.Set2Only.Count + vennDiagram.Intersection.Count);
    Console.WriteLine("{0}\t{1}\t{2}", vennDiagram.Set1Only.Count, vennDiagram.Intersection.Count, vennDiagram.Set2Only.Count);

    // The per-peptide "Peptide" column lookup that used to sit in this loop was never read and is gone.
    foreach (var peptide in vennDiagram.Set2Only)
    {
        Console.WriteLine(peptide);
    }
}
/// <summary>
/// Loads a score table from a tab-separated file whose columns are named "0", "1", … up to
/// <c>NumberOfBins - 1</c>; row <c>i</c> of the file fills <c>table[i]</c>.
/// </summary>
/// <param name="fname">Path of the score data file.</param>
/// <returns>A jagged array with <c>_massBins.Length</c> rows of <c>NumberOfBins</c> values each.</returns>
/// <exception cref="FileNotFoundException">The file does not exist.</exception>
/// <remarks>
/// Each column is looked up once and then copied into all rows, instead of being looked up again for
/// every row of the table.
/// </remarks>
private double[][] LoadTable(string fname)
{
    if (!File.Exists(fname))
    {
        throw new FileNotFoundException("Missing score datafile: " + fname);
    }

    var parser = new TsvFileParser(fname);

    var table = new double[_massBins.Length][];
    for (var i = 0; i < table.Length; i++)
    {
        table[i] = new double[NumberOfBins];
    }

    // Fill column by column so each header lookup happens exactly once.
    for (var k = 0; k < NumberOfBins; k++)
    {
        var colData = parser.GetData(string.Format("{0}", k));
        for (var i = 0; i < table.Length; i++)
        {
            table[i][k] = double.Parse(colData[i]);
        }
    }

    return table;
}
/// <summary>
/// Searches every sequence tag of the tag file against the FASTA database and prints: the average number
/// of tag matches per scan, a histogram of the number of distinct matched proteins per scan, the number of
/// identified scans (leading rows up to the first QValue above 0.01) and how many of those scans have a
/// tag that matches the identified protein.
/// The test is ignored when any input file is missing.
/// </summary>
public void CountMatchedProteins()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    const int minTagLength = 3;

    var scanToProtein = new Dictionary<int, string>();
    // Per identified scan: whether some tag of that scan hit the identified protein.
    var idTag = new Dictionary<int, bool>();

    const string resultFilePath = @"H:\Research\ProMex\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv";
    if (!File.Exists(resultFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath);
    }

    var parser = new TsvFileParser(resultFilePath);
    var idScans = parser.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray();
    var idProteins = parser.GetData("ProteinName").ToArray();
    var qValues = parser.GetData("QValue").Select(s => Convert.ToDouble(s)).ToArray();

    for (var row = 0; row < qValues.Length; row++)
    {
        // The loop ends at the first row above the threshold rather than skipping it.
        if (qValues[row] > 0.01)
        {
            break;
        }

        scanToProtein.Add(idScans[row], idProteins[row]);
        idTag.Add(idScans[row], false);
    }

    const string rawFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.raw";
    if (!File.Exists(rawFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
    }

    var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

    const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";
    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    // const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.icsfldecoy.fasta";
    // const string fastaFilePath =
    //     @"D:\Research\Data\CommonContaminants\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta";

    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);
    Console.WriteLine("Sequence length: {0}", fastaDb.GetSequence().Length);

    const string tagFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.seqtag";
    if (!File.Exists(tagFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
    }

    // Key: number of distinct proteins matched by a scan; value: number of scans with that count.
    var hist = new Dictionary<int, int>();
    var scanSet = new HashSet<int>();
    HashSet<string> proteinsOfCurrentScan = null;
    var prevScan = -1;
    var totalNumMatches = 0L;

    // The first line of the tag file is its header.
    foreach (var line in File.ReadAllLines(tagFilePath).Skip(1))
    {
        var token = line.Split('\t');
        if (token.Length < 3)
        {
            continue;
        }

        var scan = Convert.ToInt32(token[0]);

        string proteinId;
        if (!scanToProtein.TryGetValue(scan, out proteinId))
        {
            proteinId = null;
        }

        if (scan != prevScan)
        {
            // A new scan starts: close the histogram entry of the previous one.
            if (proteinsOfCurrentScan != null)
            {
                var distinctProteins = proteinsOfCurrentScan.Count;
                int occurrences;
                hist.TryGetValue(distinctProteins, out occurrences);
                hist[distinctProteins] = occurrences + 1;
            }

            prevScan = scan;
            proteinsOfCurrentScan = new HashSet<string>();
        }

        scanSet.Add(scan);

        var tag = token[1];
        if (tag.Length < minTagLength)
        {
            continue;
        }

        if (proteinsOfCurrentScan == null)
        {
            continue;
        }

        var numMatchesForThisTag = 0;
        foreach (var index in searchableDb.FindAllMatchedSequenceIndices(tag))
        {
            var matchedProtein = fastaDb.GetProteinName(index);
            proteinsOfCurrentScan.Add(matchedProtein);
            ++numMatchesForThisTag;

            if (proteinId != null && matchedProtein.Equals(proteinId))
            {
                idTag[scan] = true;
            }
        }

        totalNumMatches += numMatchesForThisTag;

        // if (numMatchesForThisTag > 10)
        // {
        //     Console.WriteLine("{0}\t{1}", tag, numMatchesForThisTag);
        // }
    }

    // Close the histogram entry of the last scan.
    if (proteinsOfCurrentScan != null)
    {
        var distinctProteins = proteinsOfCurrentScan.Count;
        int occurrences;
        hist.TryGetValue(distinctProteins, out occurrences);
        hist[distinctProteins] = occurrences + 1;
    }

    Console.WriteLine("AvgNumMatches: {0}", totalNumMatches / (float)scanSet.Count);
    Console.WriteLine("Histogram:");
    foreach (var entry in hist.OrderBy(e => e.Key))
    {
        Console.WriteLine("{0}\t{1}", entry.Key, entry.Value);
    }

    Console.WriteLine("NumId: {0}", idTag.Count);
    Console.WriteLine("NumIdByTag: {0}", idTag.Count(e => e.Value));
}
/// <summary>
/// Maps identifications from per-dataset "_IcTda.tsv" result files onto the rows of an
/// aligned feature table and writes the mapping to "aligned_ids.tsv".
/// </summary>
/// <remarks>
/// An identification (QValue at most 0.01) is assigned to the nearest aligned feature whose
/// mass lies within the tolerance and whose scan range for that dataset strictly contains
/// the identification's scan. Each feature keeps only the first identification assigned to it.
/// The test is ignored when the input folders or the feature table are missing.
/// </remarks>
public void TestTagAlignedFeatures()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    const int numDatasetColumns = 32;

    var featureDir = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, "Output");
    var mspDir = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"Output\MSP");
    var outFile = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"Output\aligned_features.tsv");

    // The relative part must not start with a separator: Path.Combine returns a rooted second
    // argument on its own, which would send this file to the root of the current drive.
    var resultFile = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"Output\aligned_ids.tsv");

    if (!Directory.Exists(featureDir))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, featureDir);
    }

    if (!Directory.Exists(mspDir))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, mspDir);
    }

    if (!File.Exists(outFile))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, outFile);
    }

    var dataset = GetDataList(featureDir);
    var tsvParser = new TsvFileParser(outFile);

    // NOTE(review): the binary search below presumes the MonoMass column is sorted ascending - confirm.
    var monoMasses = tsvParser.GetData("MonoMass");
    var massList = new List<double>();
    for (var i = 0; i < tsvParser.NumData; i++)
    {
        massList.Add(Double.Parse(monoMasses[i]));
    }

    // aligned feature row index -> full result row of the identification assigned to it
    var featureIdMap = new Dictionary<int, string>();
    var tolerance = new Tolerance(12);
    var headers = new List<string>();

    for (var d = 0; d < dataset.Count; d++)
    {
        var data = dataset[d];

        // Per-dataset scan range columns of the aligned feature table.
        var minScans = tsvParser.GetData(string.Format("{0}_minScan", d));
        var maxScans = tsvParser.GetData(string.Format("{0}_maxScan", d));

        var fname = string.Format(@"{0}\{1}_IcTda.tsv", mspDir, data);
        var idParser = new TsvFileParser(fname);
        var idRows = idParser.GetRows();
        if (headers.Count < 1)
        {
            headers.AddRange(idParser.GetHeaders());
        }

        var idScans = idParser.GetData("Scan");
        var idMasses = idParser.GetData("Mass");
        var idQValues = idParser.GetData("QValue");

        for (var i = 0; i < idParser.NumData; i++)
        {
            var scan = Int32.Parse(idScans[i]);
            var mass = Double.Parse(idMasses[i]);
            var qvalue = Double.Parse(idQValues[i]);

            // NOTE(review): stopping here presumes rows are ordered by ascending QValue - confirm.
            if (qvalue > 0.01)
            {
                break;
            }

            var massTol = tolerance.GetToleranceAsMz(mass);
            var idx = massList.BinarySearch(mass);
            if (idx < 0)
            {
                idx = ~idx; // insertion point when there is no exact match
            }

            // Walks the feature list from 'start' in direction 'step' while the mass stays
            // within tolerance; returns the first feature whose scan range contains the
            // identification's scan, or -1 when there is none.
            Func<int, int, int> findFeature = (start, step) =>
            {
                for (var j = start; j >= 0 && j < massList.Count; j += step)
                {
                    if (Math.Abs(mass - massList[j]) > massTol)
                    {
                        break;
                    }

                    // An empty cell means the feature was not observed in this dataset.
                    if (minScans[j].Length < 1)
                    {
                        continue;
                    }

                    if (Int32.Parse(minScans[j]) < scan && scan < Int32.Parse(maxScans[j]))
                    {
                        return j;
                    }
                }

                return -1;
            };

            // The insertion point equals massList.Count when the mass exceeds every feature
            // mass, so the downward walk is clamped to the last valid index.
            var hit = findFeature(Math.Min(idx, massList.Count - 1), -1);
            if (hit < 0)
            {
                hit = findFeature(idx + 1, 1);
            }

            if (hit >= 0 && !featureIdMap.ContainsKey(hit))
            {
                featureIdMap.Add(hit, idRows[i]);
            }
        }
    }

    // Columns of the aligned table named "0" .. "31", fetched once instead of once per cell.
    var datasetColumns = Enumerable.Range(0, numDatasetColumns)
        .Select(i => tsvParser.GetData(string.Format("{0}", i)))
        .ToArray();

    // 'using' closes the output file even when writing throws.
    using (var writer = new StreamWriter(resultFile))
    {
        writer.Write("AlignedFeatureID");
        writer.Write("\t");
        writer.Write(string.Join("\t", headers));
        for (var i = 0; i < numDatasetColumns; i++)
        {
            writer.Write("\t");
            writer.Write(i);
        }

        writer.Write("\n");

        var id = 1;
        foreach (var entry in featureIdMap)
        {
            writer.Write(id);
            writer.Write("\t");
            writer.Write(entry.Value);
            for (var i = 0; i < numDatasetColumns; i++)
            {
                writer.Write("\t");
                writer.Write(datasetColumns[i][entry.Key]);
            }

            writer.Write("\n");
            id++;
        }
    }
}
/// <summary>
/// Re-scores the target and decoy identification files of one dataset and writes, for each,
/// a "_Rescored.tsv" file holding the first 15 original columns plus Score and EValue.
/// </summary>
/// <remarks>
/// Input and output locations are hard-coded local paths; the test is ignored when the
/// spectrum file is missing. Rows are scored in parallel and written in their original order.
/// </remarks>
public void TestCompositeScoring()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    // Number of leading result columns copied unchanged to the rescored file.
    const int keptColumnCount = 15;

    const string rawFilePath = @"D:\MassSpecFiles\training\raw\QC_Shew_Intact_26Sep14_Bane_C2Column3.pbf";
    if (!File.Exists(rawFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
    }

    // Configure amino acid set
    var oxM = new SearchModification(Modification.Oxidation, 'M', SequenceLocation.Everywhere, false);
    var dehydroC = new SearchModification(Modification.Dehydro, 'C', SequenceLocation.Everywhere, false);
    var acetylN = new SearchModification(Modification.Acetylation, '*', SequenceLocation.ProteinNTerm, false);

    const int numMaxModsPerProtein = 4;
    var searchModifications = new List<SearchModification> { dehydroC, oxM, acetylN };
    var aaSet = new AminoAcidSet(searchModifications, numMaxModsPerProtein);
    var comparer = new FilteredProteinMassBinning(aaSet, 50000, 28);

    var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

    const double filteringWindowSize = 1.1;
    const int isotopeOffsetTolerance = 2;
    var tolerance = new Tolerance(10);
    const int minCharge = 1;
    const int maxCharge = 20;

    var graphFactory = new ProteinScoringGraphFactory(comparer, aaSet);

    // Sequences and the scorer use a default amino acid set; the scoring graph uses aaSet.
    var aminoAcidSet = new AminoAcidSet();
    var scorer = new InformedTopDownScorer(run, aminoAcidSet, minCharge, maxCharge, tolerance);

    var fileExt = new[] { "IcTarget", "IcDecoy" };
    foreach (var ext in fileExt)
    {
        var resultFileName = string.Format(@"D:\MassSpecFiles\training\Rescoring\QC_Shew_Intact_26Sep14_Bane_C2Column3_{0}.tsv", ext);
        var parser = new TsvFileParser(resultFileName);

        var scans = parser.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray();
        var charges = parser.GetData("Charge").Select(s => Convert.ToInt32(s)).ToArray();
        var protSequences = parser.GetData("Sequence").ToArray();
        var modStrs = parser.GetData("Modifications").ToArray();
        var compositions = parser.GetData("Composition").Select(Composition.Parse).ToArray();
        var protMass = parser.GetData("Mass").Select(s => Convert.ToDouble(s)).ToArray();

        // Fetched once; the parallel loop below only reads from it.
        var rows = parser.GetRows();

        var outputFileName = string.Format(@"D:\MassSpecFiles\training\Rescoring\QC_Shew_Intact_26Sep14_Bane_C2Column3_{0}_Rescored.tsv", ext);
        using (var writer = new StreamWriter(outputFileName))
        {
            writer.WriteLine(string.Join("\t", parser.GetHeaders().ToArray(), 0, keptColumnCount) + "\tScore\tEValue");

            // One slot per input row so the output keeps the input order despite parallel scoring.
            var lines = new string[parser.NumData];

            Parallel.For(0, parser.NumData, i =>
            {
                var scan = scans[i];
                var charge = charges[i];

                var sequence = Sequence.CreateSequence(protSequences[i], modStrs[i], aminoAcidSet);
                Assert.True(sequence.Composition.Equals(compositions[i] - Composition.H2O));

                var ms2Spec = run.GetSpectrum(scan) as ProductSpectrum;
                Assert.True(ms2Spec != null);

                var scores = scorer.GetScores(sequence, charge, scan);

                var deconvSpec = Deconvoluter.GetDeconvolutedSpectrum(ms2Spec, minCharge, maxCharge, isotopeOffsetTolerance, filteringWindowSize, tolerance, 0.7);
                var deconvScorer = new CompositeScorerBasedOnDeconvolutedSpectrum(deconvSpec, ms2Spec, tolerance, comparer);
                var graph = graphFactory.CreateScoringGraph(deconvScorer, protMass[i]);

                var gf = new GeneratingFunction(graph);
                gf.ComputeGeneratingFunction();
                var specEvalue = gf.GetSpectralEValue(scores.Score);

                var items = rows[i].Split('\t');
                var newRowStr = string.Join("\t", items, 0, keptColumnCount);

                // Each iteration writes only its own slot, so no lock is needed; the array is
                // read only after Parallel.For has returned.
                lines[i] = string.Format("{0}\t{1}\t{2}", newRowStr, scores.Score, specEvalue);
            });

            foreach (var line in lines)
            {
                writer.WriteLine(line);
            }
        }

        Console.WriteLine("Done");
    }
}
/// <summary>
/// Reads protein-spectrum matches from a tab-separated MS-Align result table.
/// </summary>
/// <param name="msAlignResultTablePath">Path of the MS-Align result table.</param>
/// <param name="maxPrsm">Maximum number of matches to return; a value of zero or less yields an empty list.</param>
/// <returns>
/// The matches whose FDR does not exceed <c>FdrCutoff</c>, in file order, at most
/// <paramref name="maxPrsm"/> of them.
/// </returns>
/// <exception cref="System.FormatException">A numeric field of an accepted row cannot be parsed.</exception>
public List<ProteinSpectrumMatch> ReadMsAlignResult(string msAlignResultTablePath, int maxPrsm)
{
    var parser = new TsvFileParser(msAlignResultTablePath);
    var prsmList = new List<ProteinSpectrumMatch>();

    // Result tables are machine-written, so numbers are parsed independently of the
    // current thread culture (a decimal-comma culture would otherwise misread them).
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    // Look each column up once instead of once per row.
    var peptides = parser.GetData("Peptide");
    var scanNums = parser.GetData("Scan(s)");
    var precursorMasses = parser.GetData("Precursor_mass");
    var proteinNames = parser.GetData("Protein_name");
    var firstResidues = parser.GetData("First_residue");
    var lastResidues = parser.GetData("Last_residue");
    var matchedFragments = parser.GetData("#matched_fragment_ions");
    var charges = parser.GetData("Charge");
    var evalues = parser.GetData("E-value");
    var fdrs = parser.GetData("FDR");

    for (var i = 0; i < parser.NumData && prsmList.Count < maxPrsm; i++)
    {
        // Filter first so rejected rows are not parsed any further.
        var fdr = double.Parse(fdrs[i], invariant);
        if (fdr > FdrCutoff)
        {
            continue;
        }

        // The peptide column serves both as the sequence and as the sequence text.
        var sequence = peptides[i];
        var scanNum = int.Parse(scanNums[i], invariant);
        var mass = double.Parse(precursorMasses[i], invariant);

        // "Protein_name" holds "<name> <description>"; without a space the whole text is used for both.
        var protNameDesc = proteinNames[i];
        var k = protNameDesc.IndexOf(' ');
        var protName = (k < 0) ? protNameDesc : protNameDesc.Substring(0, k);
        var protDesc = (k < 0) ? protNameDesc : protNameDesc.Substring(k + 1);

        var firstResId = int.Parse(firstResidues[i], invariant);
        var lastResId = int.Parse(lastResidues[i], invariant);
        var score = double.Parse(matchedFragments[i], invariant);
        var charge = int.Parse(charges[i], invariant);
        var evalue = double.Parse(evalues[i], invariant);

        var prsm = new ProteinSpectrumMatch(sequence, scanNum, mass, charge, protName, protDesc, firstResId, lastResId, score, ProteinSpectrumMatch.SearchTool.MsAlign)
        {
            SequenceText = sequence,
            SpectralEvalue = evalue,
        };

        prsmList.Add(prsm);
    }

    return prsmList;
}
/// <summary>
/// Reads protein-spectrum matches from a tab-separated MSPathFinder result file.
/// </summary>
/// <param name="msPathFinderResultPath">Path of the MSPathFinder result file.</param>
/// <param name="maxPrsm">Maximum number of matches to return; a value of zero or less yields an empty list.</param>
/// <param name="minScore">Rows with a score below this value are skipped.</param>
/// <param name="maxScore">Rows with a score above this value are skipped.</param>
/// <returns>
/// The matches that pass the score range and, when a "QValue" column exists, whose q-value
/// does not exceed <c>FdrCutoff</c>; in file order, at most <paramref name="maxPrsm"/> of them.
/// </returns>
/// <exception cref="System.FormatException">A numeric field of an accepted row cannot be parsed.</exception>
public List<ProteinSpectrumMatch> ReadMsPathFinderResult(string msPathFinderResultPath, int maxPrsm, double minScore = 3, double maxScore = int.MaxValue)
{
    var parser = new TsvFileParser(msPathFinderResultPath);
    var prsmList = new List<ProteinSpectrumMatch>();

    // Result files are machine-written, so numbers are parsed independently of the
    // current thread culture (a decimal-comma culture would otherwise misread them).
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    // The score is taken from "#MatchedFragments" when present, otherwise from "Score".
    var scoreColumn = parser.GetData("#MatchedFragments") ?? parser.GetData("Score");
    // Optional columns: the q-value filter and the spectral E-value are skipped when absent.
    var qValColumn = parser.GetData("QValue");
    var evalueColumn = parser.GetData("SpecEValue");

    // Look each remaining column up once instead of once per row.
    var sequences = parser.GetData("Sequence");
    var scanNums = parser.GetData("Scan");
    var masses = parser.GetData("Mass");
    var proteinNames = parser.GetData("ProteinName");
    var proteinDescs = parser.GetData("ProteinDesc");
    var charges = parser.GetData("Charge");
    var starts = parser.GetData("Start");
    var ends = parser.GetData("End");
    var modifications = parser.GetData("Modifications");
    var pres = parser.GetData("Pre");
    var posts = parser.GetData("Post");
    var proteinLengths = parser.GetData("ProteinLength");

    for (var i = 0; i < parser.NumData && prsmList.Count < maxPrsm; i++)
    {
        // Apply the filters first so rejected rows are not parsed any further.
        var score = double.Parse(scoreColumn[i], invariant);
        if (score < minScore || score > maxScore)
        {
            continue;
        }

        if (qValColumn != null)
        {
            var fdr = double.Parse(qValColumn[i], invariant);
            if (fdr > FdrCutoff)
            {
                continue;
            }
        }

        var sequence = sequences[i];
        var scanNum = int.Parse(scanNums[i], invariant);
        var mass = double.Parse(masses[i], invariant);
        var protName = proteinNames[i];
        var protDesc = proteinDescs[i];
        var charge = int.Parse(charges[i], invariant);
        var firstResId = int.Parse(starts[i], invariant);
        var lastResId = int.Parse(ends[i], invariant);
        var mod = modifications[i];
        var evalue = (evalueColumn != null) ? double.Parse(evalueColumn[i], invariant) : 0;
        var pre = pres[i];
        var post = posts[i];
        var proteinLen = int.Parse(proteinLengths[i], invariant);

        var sequenceText = GetSequenceText(sequence, mod);
        var prsm = new ProteinSpectrumMatch(sequence, scanNum, mass, charge, protName, protDesc, firstResId, lastResId, score, ProteinSpectrumMatch.SearchTool.MsPathFinder)
        {
            SequenceText = sequenceText,
            Modifications = mod,
            Pre = pre,
            Post = post,
            ProteinLength = proteinLen,
            SpectralEvalue = evalue,
        };

        prsmList.Add(prsm);
    }

    return prsmList;
}
/// <summary>
/// Walks the rows of a search result file and, for each accepted row, computes several
/// MS1-level correlation values for the precursor ion (precursor scan, next scan, next
/// isotope, neighbouring charge states).
/// </summary>
/// <remarks>
/// Only the column header line is printed: the per-row report line is disabled in this
/// test, so the computed values are not output. The test is ignored when the result file
/// or the raw file is missing.
/// </remarks>
public void TestMs1Filtering()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    // A non-decoy variant (SBEP_STM_001_02272012_Aragon.tsv) lives in the same folder.
    const string resultFilePath = @"C:\cygwin\home\kims336\Data\TopDown\raw\CorrMatches_N30\SBEP_STM_001_02272012_Aragon.decoy.icresult";
    if (!File.Exists(resultFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath);
    }

    const string rawFilePath = @"C:\cygwin\home\kims336\Data\TopDown\raw\DataFiles\SBEP_STM_001_02272012_Aragon.raw";
    if (!File.Exists(rawFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
    }

    var run = InMemoryLcMsRun.GetLcMsRun(rawFilePath, 1.4826, 1.4826);
    var tolerance = new Tolerance(15);

    var tsvReader = new TsvFileParser(resultFilePath);
    var compositionColumn = tsvReader.GetData("Composition");
    var scanNumColumn = tsvReader.GetData("ScanNum");
    var chargeColumn = tsvReader.GetData("Charge");
    var qValueColumn = tsvReader.GetData("QValue");
    var scoreColumn = tsvReader.GetData("Score");

    Console.WriteLine("ScanNum\tScore\tPrecursor\tNext\tSum\tNextIsotope\tLessCharge\tMoreCharge\tMax\tNumXicPeaks");

    for (var row = 0; row < compositionColumn.Count; row++)
    {
        // The q-value filter applies only when the file has a QValue column.
        if (qValueColumn != null && Convert.ToDouble(qValueColumn[row]) > 0.01)
        {
            continue;
        }

        var scanNum = Convert.ToInt32(scanNumColumn[row]);
        var composition = Composition.Parse(compositionColumn[row]);
        var charge = Convert.ToInt32(chargeColumn[row]);
        var precursorIon = new Ion(composition, charge);

        // Keep the row only if its scan is a product spectrum whose isolation window
        // contains the most abundant isotope of the precursor ion.
        var productSpec = run.GetSpectrum(scanNum) as ProductSpectrum;
        if (productSpec == null
            || !productSpec.IsolationWindow.Contains(precursorIon.GetMostAbundantIsotopeMz()))
        {
            continue;
        }

        var score = Convert.ToDouble(scoreColumn[row]);

        // Correlation of the precursor ion in the precursor scan.
        var precursorSpec = run.GetSpectrum(run.GetPrecursorScanNum(scanNum));
        var preIsotopeCorr = precursorSpec.GetCorrScore(precursorIon, tolerance, 0.1);

        // Correlation of the precursor ion in the scan returned by GetNextScanNum(scanNum, 1).
        var nextSpec = run.GetSpectrum(run.GetNextScanNum(scanNum, 1));
        var nextIsotopeCorr = nextSpec.GetCorrScore(precursorIon, tolerance, 0.1);

        var xicMostAbundant = run.GetPrecursorExtractedIonChromatogram(
            precursorIon.GetMostAbundantIsotopeMz(), tolerance, scanNum);

        // Falls back to the row's own scan when the apex lies before the first LC scan.
        var apexScanNum = xicMostAbundant.GetApexScanNum();
        if (apexScanNum < run.MinLcScan)
        {
            apexScanNum = scanNum;
        }

        // Chromatogram of the next isotope peak versus the most abundant one.
        var nextIsotopeMz = precursorIon.GetMostAbundantIsotopeMz() + Constants.C13MinusC12 / charge;
        var xicNextIsotope = run.GetPrecursorExtractedIonChromatogram(nextIsotopeMz, tolerance, scanNum);
        var plusOneIsotopeCorr = xicMostAbundant.GetCorrelation(xicNextIsotope);

        // Same composition at one charge lower.
        var ionChargeMinusOne = new Ion(composition, charge - 1);
        var xicChargeMinusOne = run.GetPrecursorExtractedIonChromatogram(
            ionChargeMinusOne.GetMostAbundantIsotopeMz(), tolerance, scanNum);
        var chargeMinusOneCorr = xicMostAbundant.GetCorrelation(xicChargeMinusOne);

        // Same composition at one charge higher.
        var ionChargePlusOne = new Ion(composition, charge + 1);
        var xicChargePlusOne = run.GetPrecursorExtractedIonChromatogram(
            ionChargePlusOne.GetMostAbundantIsotopeMz(), tolerance, scanNum);
        var chargePlusOneCorr = xicMostAbundant.GetCorrelation(xicChargePlusOne);
    }
}
/// <summary>
/// For every training dataset, looks up the LC-MS feature that best matches each filtered
/// identification and writes one ".ms1ft" row per identification, plus target and decoy
/// envelope statistics for every feature that was found.
/// </summary>
/// <remarks>
/// A feature matches when its mass is within tolerance (20 for masses below 2000, otherwise
/// 10, as passed to <c>Tolerance</c>) and its scan range strictly contains the midpoint of the
/// identification's scan range; among matches in one mass bin the most abundant one wins.
/// Identifications without a matching feature get a row of zeros. Datasets whose spectrum
/// file or filtered result file is missing are skipped with a console warning. The test is
/// ignored when the identification folder does not exist.
/// </remarks>
public void ExtractLcMsFeaturesForTrainingSet()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string idFileFolder = @"D:\MassSpecFiles\training\FilteredIdResult";
    if (!Directory.Exists(idFileFolder))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, idFileFolder);
    }

    var tolerance = new Tolerance(10);
    var tolerance2 = new Tolerance(20);

    // Running identifier passed to the statistics output; continues across datasets.
    var id = 1;

    for (var d = 0; d < TrainSetFileLists.Length; d++)
    {
        var dataset = TrainSetFileLists[d];
        var dataname = Path.GetFileNameWithoutExtension(dataset);
        var filteredIdResultFile = string.Format(@"{0}\{1}.trainset.tsv", idFileFolder, dataname);
        var featureResult = string.Format(@"{0}\{1}.ms1ft", idFileFolder, dataname);

        if (!File.Exists(dataset))
        {
            Console.WriteLine(@"Warning: Skipping since file not found: {0}", dataset);
            continue;
        }

        if (!File.Exists(filteredIdResultFile))
        {
            Console.WriteLine(@"Warning: Skipping since file not found: {0}", filteredIdResultFile);
            continue;
        }

        var run = PbfLcMsRun.GetLcMsRun(dataset);

        // 'using' closes all three files even when feature extraction throws part-way through.
        using (var targetStatWriter = new StreamWriter(string.Format(@"D:\MassSpecFiles\training\statistics\{0}.tsv", dataname)))
        using (var decoyStatWriter = new StreamWriter(string.Format(@"D:\MassSpecFiles\training\statistics\{0}_decoy.tsv", dataname)))
        using (var writer = new StreamWriter(featureResult))
        {
            writer.Write("Ms2MinScan\tMs2MaxScan\tMs2MinCharge\tMs2MaxCharge\tMs2Mass\t");
            writer.Write("Mass\tMinScan\tMaxScan\tMinCharge\tMaxCharge\tMinTime\tMaxTime\tElution\tGood\n");

            var tsvParser = new TsvFileParser(filteredIdResultFile);
            var featureFinder = new LcMsPeakMatrix(run);

            // Look each column up once instead of once per row.
            var minScanColumn = tsvParser.GetData("MinScan");
            var maxScanColumn = tsvParser.GetData("MaxScan");
            var minChargeColumn = tsvParser.GetData("MinCharge");
            var maxChargeColumn = tsvParser.GetData("MaxCharge");
            var massColumn = tsvParser.GetData("Mass");

            for (var i = 0; i < tsvParser.NumData; i++)
            {
                var minScan = int.Parse(minScanColumn[i]);
                var maxScan = int.Parse(maxScanColumn[i]);
                var minCharge = int.Parse(minChargeColumn[i]);
                var maxCharge = int.Parse(maxChargeColumn[i]);
                var mass = double.Parse(massColumn[i]);

                // Identification part of the row (the five "Ms2*" columns).
                writer.Write("{0}\t{1}\t{2}\t{3}\t{4}\t", minScan, maxScan, minCharge, maxCharge, mass);

                // Search the mass's own bin first, then the neighbour on the side the mass
                // leans towards, then the other neighbour.
                var binNum = featureFinder.Comparer.GetBinNumber(mass);
                var binMass = featureFinder.Comparer.GetMzAverage(binNum);
                var binNumList = (mass < binMass)
                    ? new int[] { binNum, binNum - 1, binNum + 1 }
                    : new int[] { binNum, binNum + 1, binNum - 1 };

                var massTh = (mass < 2000) ? tolerance2.GetToleranceAsTh(mass) : tolerance.GetToleranceAsTh(mass);
                var midScan = 0.5 * (minScan + maxScan);

                LcMsPeakCluster refinedFeature = null;
                foreach (var bi in binNumList)
                {
                    var highestAbu = 0d;
                    foreach (var feature in featureFinder.FindFeatures(bi))
                    {
                        if (Math.Abs(mass - feature.Mass) < massTh
                            && feature.MinScanNum < midScan && midScan < feature.MaxScanNum
                            && feature.Abundance > highestAbu)
                        {
                            refinedFeature = feature;
                            highestAbu = feature.Abundance;
                        }
                    }

                    // The first bin that yields a feature wins; later bins are not searched.
                    if (refinedFeature != null)
                    {
                        break;
                    }
                }

                if (refinedFeature == null)
                {
                    // No matching feature: zeros for the nine feature columns.
                    writer.Write("0\t0\t0\t0\t0\t0\t0\t0\t0\n");
                    continue;
                }

                // "Good" means the feature's scan range covers the whole identification scan range.
                var good = refinedFeature.MinScanNum <= minScan && refinedFeature.MaxScanNum >= maxScan;
                writer.Write(
                    "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\n",
                    refinedFeature.Mass,
                    refinedFeature.MinScanNum,
                    refinedFeature.MaxScanNum,
                    refinedFeature.MinCharge,
                    refinedFeature.MaxCharge,
                    refinedFeature.MinElutionTime,
                    refinedFeature.MaxElutionTime,
                    refinedFeature.MaxElutionTime - refinedFeature.MinElutionTime,
                    good ? 1 : 0);

                // Target statistics first, then the same feature again after UpdateWithDecoyScore.
                OutputEnvelopPeakStat(id, refinedFeature, targetStatWriter);

                var chargeRange = featureFinder.GetDetectableMinMaxCharge(refinedFeature.RepresentativeMass, run.MinMs1Mz, run.MaxMs1Mz);
                refinedFeature.UpdateWithDecoyScore(featureFinder.Ms1Spectra, chargeRange.Item1, chargeRange.Item2);
                OutputEnvelopPeakStat(id, refinedFeature, decoyStatWriter);

                id++;
            }
        }

        Console.WriteLine(dataname);
    }
}