/// <summary>
/// Aligns ProMex LC-MS features across all <c>NdataSet</c> datasets and writes an
/// identification-annotated cross-tab to a fixed output path.
/// </summary>
/// <remarks>
/// For every dataset, the PrSMs read from two MSPathFinder result folders are concatenated and
/// passed through <c>MergePrsm</c>. Each feature is then tagged with every PrSM whose scan number
/// lies strictly between the feature's min and max scan numbers and whose mass differs from the
/// feature mass by less than the value returned by <c>GetToleranceAsTh</c>.
/// </remarks>
public void TestFeatureAlignment()
{
    // Dataset name noted by the original author: CPTAC_Intact_CR32A_24Aug15_Bane_15-02-06-RZ
    const string outFilePath = @"\\protoapps\UserData\Jungkap\Lewy\aligned\promex_crosstab_temp.tsv";

    var prsmReader = new ProteinSpectrumMatchReader();
    var tolerance = new Tolerance(10);
    var alignment = new LcMsFeatureAlignment(new AnalysisCompRef.CompRefFeatureComparer(tolerance));

    for (var datasetIndex = 0; datasetIndex < NdataSet; datasetIndex++)
    {
        var datasetName = GetDataSetNames(datasetIndex);

        var rawPath = string.Format(@"{0}\{1}.pbf", PbfPath, datasetName);
        var primaryIdPath = string.Format(@"{0}\{1}_IcTda.tsv", MsPfFolder, datasetName);
        var secondaryIdPath = string.Format(@"{0}\{1}_IcTda.tsv", MsPfFolder2, datasetName);
        var featurePath = string.Format(@"{0}\{1}.ms1ft", Ms1FtFolder, datasetName);

        Console.WriteLine(rawPath);
        var run = PbfLcMsRun.GetLcMsRun(rawPath);

        // Pool the identifications from both result folders before merging them.
        var pooledPrsms = prsmReader.LoadIdentificationResult(primaryIdPath, ProteinSpectrumMatch.SearchTool.MsPathFinder);
        var extraPrsms = prsmReader.LoadIdentificationResult(secondaryIdPath, ProteinSpectrumMatch.SearchTool.MsPathFinder);
        pooledPrsms.AddRange(extraPrsms);
        var mergedPrsms = MergePrsm(pooledPrsms);

        var features = LcMsFeatureAlignment.LoadProMexResult(datasetIndex, featurePath, run);

        // The protein name doubles as the protein identifier for these results.
        foreach (var prsm in mergedPrsms)
        {
            prsm.ProteinId = prsm.ProteinName;
        }

        // Tag features by PrSMs.
        foreach (var feature in features)
        {
            var massTol = tolerance.GetToleranceAsTh(feature.Mass);

            foreach (var prsm in mergedPrsms)
            {
                var insideScanRange = feature.MinScanNum < prsm.ScanNum && prsm.ScanNum < feature.MaxScanNum;
                if (!insideScanRange)
                {
                    continue;
                }

                if (Math.Abs(feature.Mass - prsm.Mass) < massTol)
                {
                    feature.ProteinSpectrumMatches.Add(prsm);
                }
            }
        }

        alignment.AddDataSet(datasetIndex, features, run);
    }

    alignment.AlignFeatures();
    Console.WriteLine("{0} alignments ", alignment.CountAlignedFeatures);

    for (var datasetIndex = 0; datasetIndex < NdataSet; datasetIndex++)
    {
        alignment.FillMissingFeatures(datasetIndex);
        Console.WriteLine("{0} has been processed", GetDataSetNames(datasetIndex));
    }

    OutputCrossTabWithId(outFilePath, alignment);
}
/// <summary>
/// Collects the PrSM groups of a dataset that qualify as training targets.
/// </summary>
/// <param name="pbfFilePath">Path handed to <c>PbfLcMsRun.GetLcMsRun</c> to open the run.</param>
/// <param name="idFilePath">Path of the identification result file to load.</param>
/// <returns>
/// The groups that contain at least two PrSMs, whose first PrSM yields a non-null sequence, and for
/// which at least one member scan is a <c>ProductSpectrum</c> accepted by <c>IsGoodTarget</c>.
/// </returns>
public static ICollection<ProteinSpectrumMatchSet> CollectTrainSet(string pbfFilePath, string idFilePath)
{
    Modification.RegisterAndGetModification(Modification.Cysteinyl.Name, Modification.Cysteinyl.Composition);

    var reader = new ProteinSpectrumMatchReader(0.01);
    var identifications = reader.LoadIdentificationResult(idFilePath);
    var run = PbfLcMsRun.GetLcMsRun(pbfFilePath);
    var candidateGroups = GroupingByPrsm(0, identifications, new PrsmComparer(run));

    var acceptedGroups = new List<ProteinSpectrumMatchSet>();

    foreach (var group in candidateGroups)
    {
        // Groups with a single PrSM are not considered.
        if (group.Count < 2)
        {
            continue;
        }

        var sequence = group[0].GetSequence();
        if (sequence == null)
        {
            continue;
        }

        // Lazily walk the member scans and stop at the first product spectrum that passes the check.
        var hasGoodTarget = group
            .Select(prsm => run.GetSpectrum(prsm.ScanNum) as ProductSpectrum)
            .Any(spectrum => spectrum != null && IsGoodTarget(spectrum, sequence));

        if (hasGoodTarget)
        {
            acceptedGroups.Add(group);
        }
    }

    return acceptedGroups;
}
/// <summary>
/// Writes match statistics for target and decoy identifications of every training dataset.
/// </summary>
/// <remarks>
/// The test is ignored when the identification folder does not exist. For each entry of
/// <c>TrainSetFileLists</c> that has an <c>_IcTda.tsv</c> file, two files are written to the output
/// folder: <c>{dataname}_target.tsv</c> (PrSMs returned by <c>LcMsFeatureTrain.CollectTrainSet</c>)
/// and <c>{dataname}_decoy.tsv</c> (matches read from the <c>_IcDecoy.tsv</c> file). Datasets whose
/// identification list is empty are skipped.
/// </remarks>
public void TestGenerateFrequencyData()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string idFileFolder = @"D:\MassSpecFiles\training\IdScoring\MSPF_trainset";
    const string outFileFolder = @"D:\MassSpecFiles\training\IdScoring";

    if (!Directory.Exists(idFileFolder))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, idFileFolder);
    }

    Modification.RegisterAndGetModification(Modification.Cysteinyl.Name, Modification.Cysteinyl.Composition);
    Modification.RegisterAndGetModification(Modification.Phosphorylation.Name, Modification.Phosphorylation.Composition);
    Modification.RegisterAndGetModification(Modification.Methylation.Name, Modification.Methylation.Composition);
    Modification.RegisterAndGetModification(Modification.DiMethylation.Name, Modification.DiMethylation.Composition);
    Modification.RegisterAndGetModification(Modification.TriMethylation.Name, Modification.TriMethylation.Composition);
    Modification.RegisterAndGetModification("Trioxidation", new Composition(0, 0, 0, 3, 0));

    // NOTE(review): aaSet is never referenced below. It is kept because any side effects of the
    // AminoAcidSet constructor are not visible from here - confirm before removing.
    var aaSet = new AminoAcidSet(@"D:\MassSpecFiles\training\Mods.txt");

    for (var d = 0; d < TrainSetFileLists.Length; d++)
    {
        var dataset = TrainSetFileLists[d];
        var dataname = Path.GetFileNameWithoutExtension(dataset);
        var idFile = string.Format(@"{0}\{1}_IcTda.tsv", idFileFolder, dataname);
        var decoyFile = string.Format(@"{0}\{1}_IcDecoy.tsv", idFileFolder, dataname);

        if (!File.Exists(idFile))
        {
            continue;
        }

        var prsmReader = new ProteinSpectrumMatchReader(0.01);
        var prsmList = prsmReader.LoadIdentificationResult(idFile);

        // Last() throws on an empty sequence; a dataset without identifications has nothing to
        // contribute, so skip it the same way a missing id file is skipped.
        if (!prsmList.Any())
        {
            continue;
        }

        // The score of the last loaded PrSM is used to derive the final argument of the decoy read.
        var minScore = prsmList.Last().Score;
        var decoyMatches = prsmReader.ReadMsPathFinderResult(decoyFile, int.MaxValue, 1, Math.Max(minScore - 5, 10));

        var run = PbfLcMsRun.GetLcMsRun(dataset);
        var spectrumMatchSet = LcMsFeatureTrain.CollectTrainSet(dataset, idFile);
        Console.WriteLine(spectrumMatchSet.Count);

        // 'using' guarantees the file handle is released even if a spectrum lookup or
        // GetMatchStatistics throws part-way through.
        using (var targetWriter = new StreamWriter(string.Format(@"{0}\{1}_target.tsv", outFileFolder, dataname)))
        {
            foreach (var matches in spectrumMatchSet)
            {
                foreach (var match in matches)
                {
                    var spec = run.GetSpectrum(match.ScanNum) as ProductSpectrum;
                    GetMatchStatistics(spec, match.GetSequence(), match.Charge, targetWriter);
                }
            }
        }

        using (var decoyWriter = new StreamWriter(string.Format(@"{0}\{1}_decoy.tsv", outFileFolder, dataname)))
        {
            foreach (var match in decoyMatches)
            {
                var sequence = match.GetSequence();
                var spec = run.GetSpectrum(match.ScanNum) as ProductSpectrum;
                GetMatchStatistics(spec, sequence, match.Charge, decoyWriter);
            }
        }
    }
}
/// <summary>
/// Runs <c>FeatureFind</c> on the PrSMs that no ProMex feature accounts for, for every dataset
/// returned by <c>GetDataSetNamesStudy3</c>.
/// </summary>
/// <remarks>
/// A dataset is skipped when its <c>.seqtag.ms1ft</c> output file already exists. A PrSM counts as
/// covered when its scan number lies strictly between a feature's min and max scan numbers and its
/// mass differs from the feature mass by less than the value returned by <c>GetToleranceAsTh</c>.
/// </remarks>
public void FindMissingLcMsFeatures()
{
    var mspfFolder = @"D:\MassSpecFiles\CompRef_Kelleher\Study3";
    var ms1ftFolder = @"D:\MassSpecFiles\CompRef_Kelleher\Study3";

    const int outerCount = 3;
    const int innerCount = 5;

    for (var outer = 1; outer <= outerCount; outer++)
    {
        for (var inner = 1; inner <= innerCount; inner++)
        {
            var datasets = GetDataSetNamesStudy3(outer, inner);
            var datasetCount = datasets.Count;
            var prsmReader = new ProteinSpectrumMatchReader();
            var tolerance = new Tolerance(12);

            for (var datasetIndex = 0; datasetIndex < datasetCount; datasetIndex++)
            {
                var datasetName = datasets[datasetIndex];

                // Output already present: nothing to do for this dataset.
                var outPath = string.Format(@"{0}\{1}.seqtag.ms1ft", ms1ftFolder, datasetName);
                if (File.Exists(outPath))
                {
                    continue;
                }

                var rawPath = string.Format(@"{0}\{1}.pbf", PbfPath, datasetName);
                var idPath = string.Format(@"{0}\{1}_IcTda.tsv", mspfFolder, datasetName);
                var featurePath = string.Format(@"{0}\{1}.ms1ft", ms1ftFolder, datasetName);

                var run = PbfLcMsRun.GetLcMsRun(rawPath);
                var features = LcMsFeatureAlignment.LoadProMexResult(datasetIndex, featurePath, run);
                var prsms = prsmReader.LoadIdentificationResult(idPath, ProteinSpectrumMatch.SearchTool.MsPathFinder);

                // covered[k] becomes true once PrSM k has been attached to at least one feature.
                var covered = new bool[prsms.Count];

                foreach (var feature in features)
                {
                    var massTol = tolerance.GetToleranceAsTh(feature.Mass);

                    for (var k = 0; k < prsms.Count; k++)
                    {
                        var prsm = prsms[k];

                        var insideScanRange = feature.MinScanNum < prsm.ScanNum && prsm.ScanNum < feature.MaxScanNum;
                        if (!insideScanRange)
                        {
                            continue;
                        }

                        if (Math.Abs(feature.Mass - prsm.Mass) < massTol)
                        {
                            feature.ProteinSpectrumMatches.Add(prsm);
                            covered[k] = true;
                        }
                    }
                }

                // Everything left uncovered is handed to the feature finder.
                var uncovered = new List<ProteinSpectrumMatch>();
                for (var k = 0; k < prsms.Count; k++)
                {
                    if (covered[k])
                    {
                        continue;
                    }

                    uncovered.Add(prsms[k]);
                }

                FeatureFind(uncovered, run, outPath);
                Console.WriteLine(outPath);
            }
        }
    }
}
/// <summary>
/// Aligns the LC-MS features of the given datasets and writes an identification-annotated cross-tab.
/// </summary>
/// <param name="datasets">Dataset names; each is combined with the folders below to build file paths.</param>
/// <param name="mspfFolder">Folder searched for <c>{dataset}_IcTda.tsv</c> identification files.</param>
/// <param name="ms1ftFolder">Folder holding <c>{dataset}.ms1ft</c> and <c>{dataset}.seqtag.ms1ft</c> feature files.</param>
/// <param name="outFilePath">Path passed to <c>AnalysisCompRef.OutputCrossTabWithId</c>.</param>
/// <remarks>
/// Features from both feature files are combined per dataset. When the identification file exists,
/// each feature is tagged with every PrSM whose scan number lies strictly between the feature's min
/// and max scan numbers and whose mass differs from the feature mass by less than the value returned
/// by <c>GetToleranceAsTh</c>.
/// </remarks>
public void AlignFeatures(List<string> datasets, string mspfFolder, string ms1ftFolder, string outFilePath)
{
    var datasetCount = datasets.Count;
    var prsmReader = new ProteinSpectrumMatchReader();
    var tolerance = new Tolerance(12);
    var alignment = new LcMsFeatureAlignment(new AnalysisCompRef.CompRefFeatureComparer(tolerance));

    for (var datasetIndex = 0; datasetIndex < datasetCount; datasetIndex++)
    {
        var datasetName = datasets[datasetIndex];

        var rawPath = string.Format(@"{0}\{1}.pbf", PbfPath, datasetName);
        var idPath = string.Format(@"{0}\{1}_IcTda.tsv", mspfFolder, datasetName);
        var featurePath = string.Format(@"{0}\{1}.ms1ft", ms1ftFolder, datasetName);
        var seqTagFeaturePath = string.Format(@"{0}\{1}.seqtag.ms1ft", ms1ftFolder, datasetName);

        var run = PbfLcMsRun.GetLcMsRun(rawPath);

        // Combine the features from the regular and the seqtag feature files.
        var features = LcMsFeatureAlignment.LoadProMexResult(datasetIndex, featurePath, run);
        var seqTagFeatures = LcMsFeatureAlignment.LoadProMexResult(datasetIndex, seqTagFeaturePath, run);
        features.AddRange(seqTagFeatures);

        if (File.Exists(idPath))
        {
            var prsms = prsmReader.LoadIdentificationResult(idPath, ProteinSpectrumMatch.SearchTool.MsPathFinder);

            // The protein name doubles as the protein identifier for these results.
            foreach (var prsm in prsms)
            {
                prsm.ProteinId = prsm.ProteinName;
            }

            // Tag features by PrSMs.
            foreach (var feature in features)
            {
                var massTol = tolerance.GetToleranceAsTh(feature.Mass);

                foreach (var prsm in prsms)
                {
                    var insideScanRange = feature.MinScanNum < prsm.ScanNum && prsm.ScanNum < feature.MaxScanNum;
                    if (!insideScanRange)
                    {
                        continue;
                    }

                    if (Math.Abs(feature.Mass - prsm.Mass) < massTol)
                    {
                        feature.ProteinSpectrumMatches.Add(prsm);
                    }
                }
            }
        }

        alignment.AddDataSet(datasetIndex, features, run);
    }

    alignment.AlignFeatures();
    Console.WriteLine("{0} alignments ", alignment.CountAlignedFeatures);

    for (var datasetIndex = 0; datasetIndex < datasetCount; datasetIndex++)
    {
        alignment.FillMissingFeatures(datasetIndex);
        Console.WriteLine("{0} has been processed", datasets[datasetIndex]);
    }

    AnalysisCompRef.OutputCrossTabWithId(outFilePath, alignment, datasets.ToArray());
}