/// <summary>
/// Re-scores every identification of an IC result file and writes the original columns
/// followed by the new score columns to <paramref name="outputFilePath"/>.
/// </summary>
/// <remarks>
/// When the input has a "Modifications" column, its value is replaced in the output by the
/// bracketed modification string reported by the scorer. Rows for which the scorer returns
/// no scores are skipped.
/// </remarks>
/// <param name="icResultFilePath">Tab-separated result file; the Sequence, ScanNum, Charge and Composition columns are read.</param>
/// <param name="outputFilePath">File that is created (or overwritten) with the re-scored rows.</param>
private void Rescore(string icResultFilePath, string outputFilePath)
{
    var parser = new TsvFileParser(icResultFilePath);
    var sequences = parser.GetData("Sequence");
    var scanNums = parser.GetData("ScanNum").Select(s => Convert.ToInt32(s)).ToArray();
    var charges = parser.GetData("Charge").Select(c => Convert.ToInt32(c)).ToArray();
    var compositions = parser.GetData("Composition").Select(Composition.Parse).ToArray();
    var headers = parser.GetHeaders();
    // -1 when the column is absent, in which case no token is ever replaced below
    var modIndex = headers.IndexOf("Modifications");
    var rows = parser.GetRows();

    using (var writer = new StreamWriter(outputFilePath))
    {
        writer.WriteLine("{0}\t{1}", string.Join("\t", headers), IcScores.GetScoreNames());

        for (var i = 0; i < parser.NumData; i++)
        {
            var scores = _topDownScorer.GetScores(
                AminoAcid.ProteinNTerm, sequences[i], AminoAcid.ProteinCTerm,
                compositions[i], charges[i], scanNums[i]);

            // Nothing to report for this row; skip it instead of dereferencing null
            if (scores == null)
            {
                continue;
            }

            var tokens = rows[i].Split('\t');
            for (var j = 0; j < tokens.Length; j++)
            {
                writer.Write(j != modIndex ? tokens[j] : "[" + scores.Modifications + "]");
                writer.Write("\t");
            }

            writer.WriteLine(scores);
        }
    }
}
/// <summary>
/// Runs the sequence tag index finder on every confident identification (QValue at most 0.01)
/// of a result file and returns the findings as comma-separated lines.
/// </summary>
/// <param name="rawFile">Spectrum file that is opened with <c>PbfLcMsRun.GetLcMsRun</c>.</param>
/// <param name="resultFile">Tab-separated identification file; the Scan, QValue, Sequence, Modifications and #MatchedFragments columns are read.</param>
/// <param name="methodName">Name of the calling test; only used in the warning messages.</param>
/// <returns>
/// One line per confident row, each terminated by ",\n": scan number, matched fragment count,
/// sequence and the six items reported by the finder. Returns "\n" after printing a warning
/// when either input file does not exist.
/// </returns>
public string ProcessFile(string rawFile, string resultFile, string methodName)
{
    if (!File.Exists(rawFile))
    {
        Console.WriteLine(@"Warning: Skipping test {0} since file not found: {1}", methodName, rawFile);
        return "\n";
    }

    if (!File.Exists(resultFile))
    {
        Console.WriteLine(@"Warning: Skipping test {0} since file not found: {1}", methodName, resultFile);
        return "\n";
    }

    var tsvParser = new TsvFileParser(resultFile);
    var tsvData = tsvParser.GetAllData();
    var ms2ScanNumbers = tsvData["Scan"];
    var run = PbfLcMsRun.GetLcMsRun(rawFile, 0, 0);

    // Maps a scan number to the first row that mentions it, so repeated scan numbers keep
    // resolving to their first row without rescanning the whole column for every row.
    var firstRowOfScan = new System.Collections.Generic.Dictionary<int, int>();
    var resultLines = new System.Text.StringBuilder();

    for (var i = 0; i < ms2ScanNumbers.Count; i++)
    {
        var scanNum = Int32.Parse(ms2ScanNumbers[i]);
        // NOTE(review): spectrum is null when the scan is not a ProductSpectrum; it is passed on unchecked
        var spectrum = run.GetSpectrum(scanNum) as ProductSpectrum;

        int tsvIndex;
        if (!firstRowOfScan.TryGetValue(scanNum, out tsvIndex))
        {
            tsvIndex = i;
            firstRowOfScan.Add(scanNum, i);
        }

        var qValue = Double.Parse(tsvData["QValue"].ElementAt(tsvIndex));
        if (qValue > 0.01)
        {
            continue;
        }

        var seqStr = tsvData["Sequence"].ElementAt(tsvIndex).Trim();
        var seqMod = tsvData["Modifications"].ElementAt(tsvIndex).Trim();
        var matchedFrags = tsvData["#MatchedFragments"].ElementAt(tsvIndex).Trim();

        var aaSet = new AminoAcidSet();
        var sequence = Sequence.CreateSequence(seqStr, seqMod, aaSet);
        var tol = new Tolerance(10);
        var sequenceFinder = new SequenceTagIndexFinder(tol, 1, 10);
        var results = sequenceFinder.GetLongestSequence(spectrum, sequence);

        resultLines.AppendFormat(
            "{0},{1},{2},{3},{4},{5},{6},{7},{8},\n",
            scanNum, matchedFrags, seqStr,
            results.Item1, results.Item2, results.Item3, results.Item4, results.Item5, results.Item6);
    }

    return resultLines.ToString();
}
/// <summary>
/// Re-scores the identifications of an MS-Align result file and writes each scored row,
/// followed by its new scores, to <paramref name="outputFilePath"/>.
/// </summary>
/// <remarks>
/// Rows are skipped when no sequence can be extracted from the Peptide column, when the
/// extracted sequence contains "(", or when the scorer returns no scores.
/// </remarks>
/// <param name="msAlignFilePath">Tab-separated result file; the Peptide, Scan(s) and Charge columns are read.</param>
/// <param name="outputFilePath">File that is created (or overwritten) with the re-scored rows.</param>
private void Rescore(string msAlignFilePath, string outputFilePath)
{
    var tsv = new TsvFileParser(msAlignFilePath);
    var peptideColumn = tsv.GetData("Peptide");
    var scanColumn = tsv.GetData("Scan(s)").Select(s => Convert.ToInt32(s)).ToArray();
    var chargeColumn = tsv.GetData("Charge").Select(c => Convert.ToInt32(c)).ToArray();
    var dataRows = tsv.GetRows();
    var columnNames = tsv.GetHeaders();

    using (var output = new StreamWriter(outputFilePath))
    {
        output.WriteLine("{0}\t{1}", string.Join("\t", columnNames), IcScores.GetScoreNames());

        for (var rowIndex = 0; rowIndex < tsv.NumData; rowIndex++)
        {
            var originalRow = dataRows[rowIndex];
            var bareSequence = SimpleStringProcessing.GetStringBetweenDots(peptideColumn[rowIndex]);

            // TODO: ids with modifications (marked by a parenthesis) are currently ignored
            var isScorable = bareSequence != null && !bareSequence.Contains("(");
            if (!isScorable)
            {
                continue;
            }

            var composition = AASet.GetComposition(bareSequence);
            var rescored = _topDownScorer.GetScores(
                AminoAcid.ProteinNTerm,
                bareSequence,
                AminoAcid.ProteinCTerm,
                composition,
                chargeColumn[rowIndex],
                scanColumn[rowIndex]);

            if (rescored != null)
            {
                output.WriteLine("{0}\t{1}", originalRow, rescored);
            }
        }
    }
}
/// <summary>
/// Assigns identifications (QValue at most 0.01) from each dataset's IcTda result file to the
/// aligned features listed in aligned_features.tsv and writes one row per matched feature to
/// aligned_ids.tsv in the same Output folder.
/// </summary>
/// <remarks>
/// An identification matches a feature when the feature's MonoMass lies within the mass
/// tolerance of the identification mass and the identification scan lies strictly between the
/// feature's min and max scan for that dataset. Only the first identification found for a
/// feature is kept. The test is ignored when the input folders or the feature file are missing.
/// </remarks>
public void TestTagAlignedFeatures()
{
    // Number of per-dataset columns ("0" .. "31") copied from the aligned feature file
    const int datasetColumnCount = 32;
    const double maxQValue = 0.01;

    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    var featureDir = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, "Output");
    var mspDir = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"Output\MSP");
    var outFile = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"Output\aligned_features.tsv");
    // No leading separator here: a rooted second argument makes Path.Combine drop the test folder
    var resultFile = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"Output\aligned_ids.tsv");

    if (!Directory.Exists(featureDir))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, featureDir);
    }

    if (!Directory.Exists(mspDir))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, mspDir);
    }

    if (!File.Exists(outFile))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, outFile);
    }

    var dataset = GetDataList(featureDir);
    var tsvParser = new TsvFileParser(outFile);

    // NOTE(review): BinarySearch below assumes the MonoMass column is sorted ascending - confirm
    var monoMasses = tsvParser.GetData("MonoMass");
    var massList = new List<double>();
    for (var i = 0; i < tsvParser.NumData; i++)
    {
        massList.Add(Double.Parse(monoMasses[i]));
    }

    // Feature row index -> first identification row assigned to that feature
    var featureIdMap = new Dictionary<int, string>();
    var tolerance = new Tolerance(12);
    var headers = new List<string>();

    for (var d = 0; d < dataset.Count; d++)
    {
        var data = dataset[d];
        var minScans = tsvParser.GetData(string.Format("{0}_minScan", d));
        var maxScans = tsvParser.GetData(string.Format("{0}_maxScan", d));

        var fname = string.Format(@"{0}\{1}_IcTda.tsv", mspDir, data);
        var idParser = new TsvFileParser(fname);
        var idRows = idParser.GetRows();

        // The identification headers are taken from the first dataset only
        if (headers.Count < 1)
        {
            headers.AddRange(idParser.GetHeaders());
        }

        var scanColumn = idParser.GetData("Scan");
        var massColumn = idParser.GetData("Mass");
        var qValueColumn = idParser.GetData("QValue");

        for (var i = 0; i < idParser.NumData; i++)
        {
            var scan = Int32.Parse(scanColumn[i]);
            var mass = Double.Parse(massColumn[i]);
            var qvalue = Double.Parse(qValueColumn[i]);

            // Stops at the first row above the threshold (presumably rows are sorted by QValue - confirm)
            if (qvalue > maxQValue)
            {
                break;
            }

            var massTol = tolerance.GetToleranceAsMz(mass);
            var idx = massList.BinarySearch(mass);
            if (idx < 0)
            {
                idx = ~idx;
            }

            // Walk down from the insertion point. The insertion point equals Count when the
            // mass exceeds every feature mass, so clamp it to the last valid index.
            var found = false;
            for (var j = Math.Min(idx, massList.Count - 1); j >= 0; j--)
            {
                if (Math.Abs(mass - massList[j]) > massTol)
                {
                    break;
                }

                // An empty min-scan cell means the feature was not observed in this dataset
                if (minScans[j].Length < 1)
                {
                    continue;
                }

                if (Int32.Parse(minScans[j]) < scan && scan < Int32.Parse(maxScans[j]))
                {
                    found = true;
                    if (!featureIdMap.ContainsKey(j))
                    {
                        featureIdMap.Add(j, idRows[i]);
                    }

                    break;
                }
            }

            if (found)
            {
                continue;
            }

            // Nothing matched below the insertion point; walk up through the heavier features
            for (var j = idx + 1; j < massList.Count; j++)
            {
                if (Math.Abs(mass - massList[j]) > massTol)
                {
                    break;
                }

                if (minScans[j].Length < 1)
                {
                    continue;
                }

                if (Int32.Parse(minScans[j]) < scan && scan < Int32.Parse(maxScans[j]))
                {
                    if (!featureIdMap.ContainsKey(j))
                    {
                        featureIdMap.Add(j, idRows[i]);
                    }

                    break;
                }
            }
        }
    }

    // The using block closes the file even when writing a row throws
    using (var writer = new StreamWriter(resultFile))
    {
        writer.Write("AlignedFeatureID");
        writer.Write("\t");
        writer.Write(string.Join("\t", headers));
        for (var i = 0; i < datasetColumnCount; i++)
        {
            writer.Write("\t");
            writer.Write("{0}", i);
        }

        writer.Write("\n");

        var id = 1;
        foreach (var entry in featureIdMap)
        {
            writer.Write(id);
            writer.Write("\t");
            writer.Write(entry.Value);
            for (var i = 0; i < datasetColumnCount; i++)
            {
                writer.Write("\t");
                writer.Write("{0}", tsvParser.GetData(string.Format("{0}", i))[entry.Key]);
            }

            writer.Write("\n");
            id++;
        }
    }
}
/// <summary>
/// Re-scores the target and decoy identifications of a fixed training dataset and writes, for
/// each of them, a "_Rescored.tsv" file holding the first 15 original columns plus the new
/// score and spectral E-value.
/// </summary>
/// <remarks>
/// The test is ignored when the spectrum file is not present. Rows are processed in parallel
/// and written in their original order afterwards.
/// </remarks>
public void TestCompositeScoring()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string rawFilePath = @"D:\MassSpecFiles\training\raw\QC_Shew_Intact_26Sep14_Bane_C2Column3.pbf";
    if (!File.Exists(rawFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
    }

    // Amino acid set carrying the searched modifications; it drives mass binning and the scoring graph
    const int numMaxModsPerProtein = 4;
    var searchModifications = new List<SearchModification>
    {
        new SearchModification(Modification.Dehydro, 'C', SequenceLocation.Everywhere, false),
        new SearchModification(Modification.Oxidation, 'M', SequenceLocation.Everywhere, false),
        new SearchModification(Modification.Acetylation, '*', SequenceLocation.ProteinNTerm, false)
    };
    var modifiedAaSet = new AminoAcidSet(searchModifications, numMaxModsPerProtein);
    var massBinning = new FilteredProteinMassBinning(modifiedAaSet, 50000, 28);

    var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

    const double filteringWindowSize = 1.1;
    const int isotopeOffsetTolerance = 2;
    const int minCharge = 1;
    const int maxCharge = 20;
    var tolerance = new Tolerance(10);

    var graphFactory = new ProteinScoringGraphFactory(massBinning, modifiedAaSet);

    // Sequences are built and scored against the default (unmodified) amino acid set
    var plainAaSet = new AminoAcidSet();
    var scorer = new InformedTopDownScorer(run, plainAaSet, minCharge, maxCharge, tolerance);

    foreach (var ext in new[] { "IcTarget", "IcDecoy" })
    {
        var resultFileName = string.Format(@"D:\MassSpecFiles\training\Rescoring\QC_Shew_Intact_26Sep14_Bane_C2Column3_{0}.tsv", ext);
        var parser = new TsvFileParser(resultFileName);

        var scans = parser.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray();
        var charges = parser.GetData("Charge").Select(s => Convert.ToInt32(s)).ToArray();
        var protSequences = parser.GetData("Sequence").ToArray();
        var modStrs = parser.GetData("Modifications").ToArray();
        var compositions = parser.GetData("Composition").Select(Composition.Parse).ToArray();
        var protMass = parser.GetData("Mass").Select(s => Convert.ToDouble(s)).ToArray();

        var outputFileName = string.Format(@"D:\MassSpecFiles\training\Rescoring\QC_Shew_Intact_26Sep14_Bane_C2Column3_{0}_Rescored.tsv", ext);
        using (var writer = new StreamWriter(outputFileName))
        {
            var headerCells = parser.GetHeaders().ToArray();
            writer.WriteLine(string.Join("\t", headerCells, 0, 15) + "\tScore\tEValue");

            // One slot per input row so the parallel workers cannot reorder the output
            var lines = new string[parser.NumData];

            Parallel.For(0, parser.NumData, rowIndex =>
            {
                var scan = scans[rowIndex];
                var charge = charges[rowIndex];

                var sequence = Sequence.CreateSequence(protSequences[rowIndex], modStrs[rowIndex], plainAaSet);
                Assert.True(sequence.Composition.Equals(compositions[rowIndex] - Composition.H2O));

                var ms2Spec = run.GetSpectrum(scan) as ProductSpectrum;
                Assert.True(ms2Spec != null);

                var scores = scorer.GetScores(sequence, charge, scan);

                var deconvSpec = Deconvoluter.GetDeconvolutedSpectrum(
                    ms2Spec, minCharge, maxCharge, isotopeOffsetTolerance, filteringWindowSize, tolerance, 0.7);
                var deconvScorer = new CompositeScorerBasedOnDeconvolutedSpectrum(deconvSpec, ms2Spec, tolerance, massBinning);
                var graph = graphFactory.CreateScoringGraph(deconvScorer, protMass[rowIndex]);

                var generatingFunction = new GeneratingFunction(graph);
                generatingFunction.ComputeGeneratingFunction();
                var specEvalue = generatingFunction.GetSpectralEValue(scores.Score);

                var cells = parser.GetRows()[rowIndex].Split('\t');
                var leadingCells = string.Join("\t", cells, 0, 15);

                lock (lines)
                {
                    lines[rowIndex] = string.Format("{0}\t{1}\t{2}", leadingCells, scores.Score, specEvalue);
                }
            });

            foreach (var line in lines)
            {
                writer.WriteLine(line);
            }
        }

        Console.WriteLine("Done");
    }
}
/// <summary>
/// Annotates Skyline transition results with the best probability and the matching q-value
/// found in Henry's per-sample result files, and writes the combined table as a TSV file.
/// </summary>
/// <remarks>
/// Skyline rows whose peptide and nominal mass never occur in Henry's results are dropped.
/// Rows without a score for their sample and charge get "NA" in both score columns. The last
/// two Skyline columns are not copied to the output. The test is ignored when the result
/// folder or the Skyline file is missing.
/// </remarks>
public void CreatePeptideAbundanceTableWithSkyline()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    const string henryResultPath = @"H:\Research\IPRG2015\Henry_results\tsv";
    if (!Directory.Exists(henryResultPath))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, henryResultPath);
    }

    // "peptide:nominalMass" keys seen in any sample
    var knownPeptideKeys = new HashSet<string>();
    // "sample:peptide:nominalMass:charge" -> (highest probability, q-value of that entry)
    var bestScores = new Dictionary<string, Tuple<double, double>>();
    var aaSet = new AminoAcidSet();

    foreach (var resultFile in Directory.GetFiles(henryResultPath, "*.tsv"))
    {
        var fileName = Path.GetFileName(resultFile);
        if (fileName == null)
        {
            continue;
        }

        // The first two characters of the file name identify the sample
        var sample = fileName.Substring(0, 2);
        Console.WriteLine("Processing {0}", sample);

        var tsvReader = new TsvFileParser(resultFile);
        var rawPeptides = tsvReader.GetData("Peptide").ToArray();
        var chargeStates = tsvReader.GetData("Charge").Select(c => Convert.ToInt32(c)).ToArray();
        var probabilities = tsvReader.GetData("Prob").Select(Convert.ToDouble).ToArray();
        var qValues = tsvReader.GetData("QValue").Select(Convert.ToDouble).ToArray();

        for (var row = 0; row < tsvReader.NumData; row++)
        {
            var nominalMass = GetNominalMass(aaSet, rawPeptides[row]);
            var peptideKey = GetPeptide(rawPeptides[row]) + ":" + nominalMass;
            var scoreKey = sample + ":" + peptideKey + ":" + chargeStates[row];

            knownPeptideKeys.Add(peptideKey);

            Tuple<double, double> previous;
            var isNewOrBetter = !bestScores.TryGetValue(scoreKey, out previous)
                                || probabilities[row] > previous.Item1;
            if (isNewOrBetter)
            {
                bestScores[scoreKey] = Tuple.Create(probabilities[row], qValues[row]);
            }
        }
    }

    const string skylineFilePath = @"H:\Research\IPRG2015\MySkyline\TransitionResults.csv";
    if (!File.Exists(skylineFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, skylineFilePath);
    }

    var skylineTable = new TsvFileParser(skylineFilePath, ',');

    const string outputFilePath = @"H:\Research\IPRG2015\MySkyline\SkylineTransitionResultsWithScores3.tsv";
    using (var writer = new StreamWriter(outputFilePath))
    {
        var peptideSequences = skylineTable.GetData("Peptide Sequence").ToArray();
        // The sample code is built from the first and third character of the replicate name
        var sampleCodes = skylineTable.GetData("Replicate Name").Select(s => s[0].ToString() + s[2]).ToArray();
        var precursorCharges = skylineTable.GetData("Precursor Charge").Select(c => Convert.ToInt32(c)).ToArray();
        var precursorMzs = skylineTable.GetData("Precursor Mz").Select(Convert.ToDouble).ToArray();

        var skylineHeaders = skylineTable.GetHeaders();
        var keptHeaders = skylineHeaders.Take(skylineHeaders.Count - 2);
        writer.WriteLine("{0}\tProbability\tQValue", string.Join("\t", keptHeaders));

        for (var row = 0; row < skylineTable.NumData; row++)
        {
            var precursorMz = precursorMzs[row];
            var charge = precursorCharges[row];
            var nominalMass = (int)Math.Round(
                ((precursorMz - Constants.Proton) * charge - Composition.H2O.Mass) * Constants.RescalingConstant);

            var peptideKey = peptideSequences[row] + ":" + nominalMass;
            if (!knownPeptideKeys.Contains(peptideKey))
            {
                continue;
            }

            var scoreKey = sampleCodes[row] + ":" + peptideSequences[row] + ":" + nominalMass + ":" + charge;

            var probabilityText = "NA";
            var qValueText = "NA";
            Tuple<double, double> scores;
            if (bestScores.TryGetValue(scoreKey, out scores))
            {
                probabilityText = scores.Item1.ToString();
                qValueText = scores.Item2.ToString();
            }

            var cells = skylineTable.GetRows()[row].Split(',');
            for (var column = 0; column < cells.Length - 2; column++)
            {
                var cell = cells[column];
                // The third column is shortened to the same two-character sample code
                var text = column == 2 ? cell[0].ToString() + cell[2] : cell;
                writer.Write(text + "\t");
            }

            writer.WriteLine("{0}\t{1}", probabilityText, qValueText);
        }
    }

    Console.WriteLine("Done");
}
public void TestCompositeScoring() { var methodName = MethodBase.GetCurrentMethod().Name; Utils.ShowStarting(methodName); var pbfFilePath = Utils.GetPbfTestFilePath(false); var pbfFile = Utils.GetTestFile(methodName, pbfFilePath); // Configure amino acid set var oxM = new SearchModification(Modification.Oxidation, 'M', SequenceLocation.Everywhere, false); var dehydroC = new SearchModification(Modification.Dehydro, 'C', SequenceLocation.Everywhere, false); var acetylN = new SearchModification(Modification.Acetylation, '*', SequenceLocation.ProteinNTerm, false); const int numMaxModsPerProtein = 4; var searchModifications = new List <SearchModification> { dehydroC, oxM, acetylN }; var aaSet = new AminoAcidSet(searchModifications, numMaxModsPerProtein); var comparer = new FilteredProteinMassBinning(aaSet, 50000, 28); var run = PbfLcMsRun.GetLcMsRun(pbfFile.FullName); const double filteringWindowSize = 1.1; const int isotopeOffsetTolerance = 2; var tolerance = new Tolerance(10); const int minCharge = 1; const int maxCharge = 20; var graphFactory = new ProteinScoringGraphFactory(comparer, aaSet); var aminoAcidSet = new AminoAcidSet(); //var scorer = new MatchedPeakPostScorer(tolerance, minCharge, maxCharge); var scorer = new InformedTopDownScorer(run, aminoAcidSet, minCharge, maxCharge, tolerance); if (pbfFile.DirectoryName == null) { Assert.Ignore("Ignoring test since cannot determine the parent directory of " + pbfFile.FullName); } var fileExt = new[] { "IcTarget", "IcDecoy" }; foreach (var ext in fileExt) { var resultFileName = Path.Combine(pbfFile.DirectoryName, Path.GetFileNameWithoutExtension(pbfFile.Name)) + string.Format("_{0}.tsv", ext); var parser = new TsvFileParser(resultFileName); var scans = parser.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray(); var charges = parser.GetData("Charge").Select(s => Convert.ToInt32(s)).ToArray(); var protSequences = parser.GetData("Sequence").ToArray(); var modStrs = parser.GetData("Modifications").ToArray(); var 
compositions = parser.GetData("Composition").Select(Composition.Parse).ToArray(); var protMass = parser.GetData("Mass").Select(s => Convert.ToDouble(s)).ToArray(); var outputFileName = Path.Combine(pbfFile.DirectoryName, Path.GetFileNameWithoutExtension(pbfFile.Name)) + string.Format("_{0}_Rescored.tsv", ext); using (var writer = new StreamWriter(outputFileName)) { writer.WriteLine(string.Join("\t", parser.GetHeaders().ToArray(), 0, 15) + "\tScore\tEValue"); var lines = new string[parser.NumData]; //for (var i = 0; i < parser.NumData; i++) Parallel.For(0, 30, i => { var scan = scans[i]; var charge = charges[i]; var protSequence = protSequences[i]; var modStr = modStrs[i]; var sequence = Sequence.CreateSequence(protSequence, modStr, aminoAcidSet); // Assert.True(sequence.Composition.Equals(compositions[i] - Composition.H2O)); if (!(run.GetSpectrum(scan) is ProductSpectrum ms2Spec)) { Console.WriteLine("Could not get the spectrum datafor scan {0}", scan); }
/// <summary>
/// Appends a "PrecursorIntensity" column to an IcTda result file: for every identification,
/// the highest intensity among the isotope peaks of its precursor ion found in the precursor scan.
/// </summary>
/// <remarks>
/// Rows for which no isotope peaks are returned keep an intensity of 0. The test is ignored
/// when the spectrum file or the result file is missing.
/// </remarks>
public void AddMostAbundantIsotopePeakIntensity()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    var rawFilePath = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"TestYufengData\QC_ShewIntact_40K_LongSeparation_1_141016155143.raw");
    if (!File.Exists(rawFilePath))
    {
        Assert.Ignore(@"Skipping test " + methodName + @" since file not found: " + rawFilePath);
    }

    // Check the second input before opening the run, so a missing file is reported as ignored
    var resultFilePath = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"TestYufengData\QC_ShewIntact_40K_LongSeparation_1_141016155143_IcTda.tsv");
    if (!File.Exists(resultFilePath))
    {
        Assert.Ignore(@"Skipping test " + methodName + @" since file not found: " + resultFilePath);
    }

    var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

    var parser = new TsvFileParser(resultFilePath);
    var compositions = parser.GetData("Composition").Select(Composition.Parse).ToArray();
    var scanNums = parser.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray();
    var charges = parser.GetData("Charge").Select(s => Convert.ToInt32(s)).ToArray();

    var precursorIntensities = new double[parser.NumData];
    var tolerance = new Tolerance(10);

    for (var i = 0; i < parser.NumData; i++)
    {
        var precursorIon = new Ion(compositions[i], charges[i]);
        var precursorScanNum = run.GetPrecursorScanNum(scanNums[i]);
        var precursorSpec = run.GetSpectrum(precursorScanNum);

        var isotopePeaks = precursorSpec.GetAllIsotopePeaks(precursorIon, tolerance, 0.1);
        if (isotopePeaks == null)
        {
            continue;
        }

        // Individual entries may be null; they are skipped
        var maxIntensity = 0.0;
        foreach (var peak in isotopePeaks)
        {
            if (peak != null && peak.Intensity > maxIntensity)
            {
                maxIntensity = peak.Intensity;
            }
        }

        precursorIntensities[i] = maxIntensity;
    }

    // Writing
    var newResultFilePath = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"TestYufengData\QC_ShewIntact_40K_LongSeparation_1_141016155143_IcTdaWithIntensities.tsv");
    var rows = parser.GetRows();
    using (var writer = new StreamWriter(newResultFilePath))
    {
        writer.WriteLine(string.Join("\t", parser.GetHeaders()) + "\t" + "PrecursorIntensity");
        for (var i = 0; i < parser.NumData; i++)
        {
            writer.WriteLine(rows[i] + "\t" + precursorIntensities[i]);
        }
    }

    Console.WriteLine("Done");
}
/// <summary>
/// Re-scores up to the first 30 target and decoy identifications belonging to the PBF test
/// file and prints up to 20 of the re-scored rows to the console.
/// </summary>
/// <remarks>
/// For each of the two result files a "_Rescored.tsv" file is created next to the PBF file;
/// only its header line is written, the re-scored rows themselves go to the console. Rows
/// whose scan does not yield a product spectrum are reported and left out.
/// </remarks>
public void TestCompositeScoring()
{
    // Upper bound on the number of rows re-scored per result file
    const int maxRowsToRescore = 30;
    const int maxRowsToPrint = 20;

    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    var pbfFilePath = Utils.GetPbfTestFilePath(false);
    var pbfFile = Utils.GetTestFile(methodName, pbfFilePath);

    // Configure amino acid set
    var oxM = new SearchModification(Modification.Oxidation, 'M', SequenceLocation.Everywhere, false);
    var dehydroC = new SearchModification(Modification.Dehydro, 'C', SequenceLocation.Everywhere, false);
    var acetylN = new SearchModification(Modification.Acetylation, '*', SequenceLocation.ProteinNTerm, false);

    const int numMaxModsPerProtein = 4;
    var searchModifications = new List<SearchModification> { dehydroC, oxM, acetylN };
    var aaSet = new AminoAcidSet(searchModifications, numMaxModsPerProtein);
    var comparer = new FilteredProteinMassBinning(aaSet, 50000, 28);

    var run = PbfLcMsRun.GetLcMsRun(pbfFile.FullName);

    const double filteringWindowSize = 1.1;
    const int isotopeOffsetTolerance = 2;
    var tolerance = new Tolerance(10);
    const int minCharge = 1;
    const int maxCharge = 20;
    var graphFactory = new ProteinScoringGraphFactory(comparer, aaSet);
    var aminoAcidSet = new AminoAcidSet();
    var scorer = new InformedTopDownScorer(run, aminoAcidSet, minCharge, maxCharge, tolerance);

    if (pbfFile.DirectoryName == null)
    {
        Assert.Ignore("Ignoring test since cannot determine the parent directory of " + pbfFile.FullName);
    }

    var fileExt = new[] { "IcTarget", "IcDecoy" };
    foreach (var ext in fileExt)
    {
        var resultFileName = Path.Combine(pbfFile.DirectoryName, Path.GetFileNameWithoutExtension(pbfFile.Name)) + string.Format("_{0}.tsv", ext);
        var parser = new TsvFileParser(resultFileName);
        var scans = parser.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray();
        var charges = parser.GetData("Charge").Select(s => Convert.ToInt32(s)).ToArray();
        var protSequences = parser.GetData("Sequence").ToArray();
        var modStrs = parser.GetData("Modifications").ToArray();
        var protMass = parser.GetData("Mass").Select(s => Convert.ToDouble(s)).ToArray();

        var outputFileName = Path.Combine(pbfFile.DirectoryName, Path.GetFileNameWithoutExtension(pbfFile.Name)) + string.Format("_{0}_Rescored.tsv", ext);
        using (var writer = new StreamWriter(outputFileName))
        {
            writer.WriteLine(string.Join("\t", parser.GetHeaders().ToArray(), 0, 15) + "\tScore\tEValue");

            var lines = new string[parser.NumData];

            // Never index past the available rows when the file holds fewer than the limit
            var rowsToRescore = Math.Min(maxRowsToRescore, parser.NumData);

            Parallel.For(0, rowsToRescore, i =>
            {
                var scan = scans[i];
                var charge = charges[i];
                var sequence = Sequence.CreateSequence(protSequences[i], modStrs[i], aminoAcidSet);

                var ms2Spec = run.GetSpectrum(scan) as ProductSpectrum;
                if (ms2Spec == null)
                {
                    Console.WriteLine("Could not get the spectrum datafor scan {0}", scan);
                    return;
                }

                var scores = scorer.GetScores(sequence, charge, scan);

                var deconvSpec = Deconvoluter.GetDeconvolutedSpectrum(ms2Spec, minCharge, maxCharge, isotopeOffsetTolerance, filteringWindowSize, tolerance, 0.7);
                var deconvScorer = new CompositeScorerBasedOnDeconvolutedSpectrum(deconvSpec, ms2Spec, tolerance, comparer);
                var graph = graphFactory.CreateScoringGraph(deconvScorer, protMass[i]);

                var gf = new GeneratingFunction(graph);
                gf.ComputeGeneratingFunction();
                var specEvalue = gf.GetSpectralEValue(scores.Score);

                var items = parser.GetRows()[i].Split('\t');
                var newRowStr = string.Join("\t", items, 0, 15);

                // Each iteration writes only its own slot, so no locking is needed
                lines[i] = string.Format("{0}\t{1}\t{2}", newRowStr, scores.Score, specEvalue);
            });

            foreach (var line in lines.Where(item => !string.IsNullOrWhiteSpace(item)).Take(maxRowsToPrint))
            {
                Console.WriteLine(line);
            }
        }

        Console.WriteLine("Done");
    }
}
/// <summary>
/// Builds the per-experiment abundance table of the iPRG 2015 pre-study: one output row per
/// entry of the AMT peptide result file, holding the peptide (when that column exists), the
/// protein, its length, the abundance for every job and a spike-in flag.
/// </summary>
/// <remarks>
/// Entries whose protein name starts with "XXX" or "Contaminant" are left out. The spike-in
/// flag is 1 for proteins whose name starts with "STANDARD" and 0 otherwise. The test is
/// ignored when the FASTA database or the job list is missing.
/// </remarks>
public void ProcessIprg2015PreStudy()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dir = @"H:\Research\IPRG2015";

    const string databaseFilePath = dir + @"\database\yeast6proteaprotein.fasta";
    if (!File.Exists(databaseFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, databaseFilePath);
    }

    var database = new FastaDatabase(databaseFilePath);
    database.Read();

    const string jobFilePath = dir + @"\Jobs.tsv";
    if (!File.Exists(jobFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, jobFilePath);
    }

    var jobParser = new TsvFileParser(jobFilePath);
    var jobs = jobParser.GetData("Jobs").Select(j => Convert.ToInt32(j)).ToArray();
    // The experiment label is the third underscore-separated part of the experiment name
    var experiments = jobParser.GetData("Experiments").Select(e => e.Split('_')[2]).ToArray();

    const string resultFilePath = dir + @"\AMT_Peptides_NA.tsv";
    const string outputFilePath = dir + @"\AMT_Peptides.tsv";

    var parser = new TsvFileParser(resultFilePath);
    var headers = parser.GetHeaders();

    // Index of the first header containing the job number; stays 0 when no header matches
    var jobColNum = new int[jobs.Length];
    for (var jobIndex = 0; jobIndex < jobs.Length; jobIndex++)
    {
        var jobText = jobs[jobIndex].ToString();
        var column = 0;
        while (column < headers.Count && !headers[column].Contains(jobText))
        {
            column++;
        }

        if (column < headers.Count)
        {
            jobColNum[jobIndex] = column;
        }
    }

    for (var jobIndex = 0; jobIndex < jobs.Length; jobIndex++)
    {
        Console.WriteLine("{0}\t{1}\t{2}", jobs[jobIndex], jobColNum[jobIndex], experiments[jobIndex]);
    }

    using (var writer = new StreamWriter(outputFilePath))
    {
        var peptides = parser.GetData("Peptide");   // Peptides
        var proteins = parser.GetData("Reference"); // Proteins

        var abundances = new string[jobs.Length][];
        for (var jobIndex = 0; jobIndex < jobs.Length; jobIndex++)
        {
            var abundanceHeader = headers[jobColNum[jobIndex]];
            abundances[jobIndex] = parser.GetData(abundanceHeader).ToArray();
        }

        var hasPeptideColumn = peptides != null;

        // Header line
        if (hasPeptideColumn)
        {
            writer.Write("Peptide\t");
        }

        writer.Write("Protein\tLength");
        for (var jobIndex = 0; jobIndex < jobs.Length; jobIndex++)
        {
            writer.Write("\t" + experiments[jobIndex]);
        }

        writer.WriteLine("\tSpikeIn");

        // Data lines
        for (var row = 0; row < proteins.Count; row++)
        {
            var protein = proteins[row];
            var isExcluded = protein.StartsWith("XXX") || protein.StartsWith("Contaminant");
            if (isExcluded)
            {
                continue;
            }

            var length = database.GetProteinLength(protein);

            if (hasPeptideColumn)
            {
                writer.Write(peptides[row] + "\t");
            }

            writer.Write(protein + "\t" + length);

            for (var jobIndex = 0; jobIndex < jobs.Length; jobIndex++)
            {
                writer.Write("\t" + abundances[jobIndex][row]);
            }

            var spikeInFlag = protein.StartsWith("STANDARD") ? 1 : 0;
            writer.WriteLine("\t" + spikeInFlag);
        }
    }
}
/// <summary>
/// For every confident identification of a training result file, extracts sequence tags from
/// the matching spectrum and reports how many of them occur in the identified sequence.
/// </summary>
/// <remarks>
/// Prints one line per spectrum (scan, sequence length, matching tags / all tags) and finally
/// the number of spectra with at least one matching tag over the number of spectra examined.
/// Processing stops at the first row whose QValue exceeds 0.01. The test is ignored when the
/// spectrum file or the result file is missing.
/// </remarks>
public void TestSequenceTag()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    const string TestRawFile = @"D:\MassSpecFiles\training\raw\QC_Shew_Intact_26Sep14_Bane_C2Column3.pbf";
    const string TestResultFile = @"D:\MassSpecFiles\training\IdResult\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv";

    if (!File.Exists(TestRawFile))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, TestRawFile);
    }

    if (!File.Exists(TestResultFile))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, TestResultFile);
    }

    // Configure amino acid set: every standard residue plus its acetylated and oxidized form
    var aminoAcidList = new List<AminoAcid>();
    foreach (var aa in AminoAcid.StandardAminoAcidArr)
    {
        aminoAcidList.Add(aa);
        aminoAcidList.Add(new ModifiedAminoAcid(aa, Modification.Acetylation));
        aminoAcidList.Add(new ModifiedAminoAcid(aa, Modification.Oxidation));
    }

    var tsvParser = new TsvFileParser(TestResultFile);
    var tsvData = tsvParser.GetAllData();
    var ms2ScanNumbers = tsvData["Scan"];

    var run = PbfLcMsRun.GetLcMsRun(TestRawFile);

    // Maps a scan number to the first row that mentions it, so repeated scan numbers keep
    // resolving to their first row without rescanning the whole column for every row.
    var firstRowOfScan = new Dictionary<int, int>();

    var nSpec = 0;
    var nHitSpec = 0;

    for (var i = 0; i < ms2ScanNumbers.Count; i++)
    {
        var scanNum = Int32.Parse(ms2ScanNumbers[i]);
        // NOTE(review): spectrum is null when the scan is not a ProductSpectrum; it is passed on unchecked
        var spectrum = run.GetSpectrum(scanNum) as ProductSpectrum;

        int tsvIndex;
        if (!firstRowOfScan.TryGetValue(scanNum, out tsvIndex))
        {
            tsvIndex = i;
            firstRowOfScan.Add(scanNum, i);
        }

        var qValue = double.Parse(tsvData["QValue"].ElementAt(tsvIndex));
        if (qValue > 0.01)
        {
            break;
        }

        var seqStr = tsvData["Sequence"].ElementAt(tsvIndex).Trim();

        var tolerance = new Tolerance(5);
        var tagFinder = new SequenceTagFinder(spectrum, tolerance, 5, 8, aminoAcidList.ToArray());

        var nTags = 0;
        var nHit = 0;
        foreach (var seqTagStr in tagFinder.GetAllSequenceTagString())
        {
            // A tag counts as a hit when it occurs verbatim in the identified sequence
            if (seqStr.Contains(seqTagStr.Sequence))
            {
                nHit++;
            }

            nTags++;
        }

        nSpec++;
        if (nHit > 0)
        {
            nHitSpec++;
        }

        Console.WriteLine(@"[{0}]seqLen = {1}: {2}/{3}", scanNum, seqStr.Length, nHit, nTags);
    }

    Console.Write("{0}/{1}", nHitSpec, nSpec);
}