/// <summary>
/// For every dataset in <c>TrainSetFileLists</c>, collects the training set from its
/// identification result file and writes it as a tab-separated file
/// (<c>{dataset name}.trainset.tsv</c>) into the output folder.
/// </summary>
/// <remarks>
/// The test is ignored when the identification-result folder does not exist.
/// Datasets whose output file already exists are skipped, as are datasets for which
/// neither a <c>_IcTda.tsv</c> nor a <c>_msgfdb_syn.txt</c> identification file is found.
/// </remarks>
public void CollectTrainingSet()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string idFileFolder = @"D:\MassSpecFiles\training\IdResult";
    const string outFileFolder = @"D:\MassSpecFiles\training\FilteredIdResult";

    if (!Directory.Exists(idFileFolder))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, idFileFolder);
    }

    // StreamWriter does not create missing directories; make sure the output
    // folder exists so the first write does not fail (no-op when it already exists).
    Directory.CreateDirectory(outFileFolder);

    foreach (var dataset in TrainSetFileLists)
    {
        var dataname = Path.GetFileNameWithoutExtension(dataset);
        var outFileName = Path.Combine(outFileFolder, dataname + ".trainset.tsv");

        // Results from a previous run are kept; delete the file to regenerate it.
        if (File.Exists(outFileName))
        {
            continue;
        }

        Console.WriteLine(dataset);

        // Prefer the _IcTda.tsv result; fall back to the _msgfdb_syn.txt result.
        var idFile = Path.Combine(idFileFolder, dataname + "_IcTda.tsv");
        if (!File.Exists(idFile))
        {
            idFile = Path.Combine(idFileFolder, dataname + "_msgfdb_syn.txt");
            if (!File.Exists(idFile))
            {
                Console.WriteLine(@"Skipping file since not found: " + idFile);
                continue;
            }
        }

        Console.WriteLine(idFile);

        var targetSets = LcMsFeatureTrain.CollectTrainSet(dataset, idFile);
        Console.WriteLine(targetSets.Count);

        // The using statement guarantees the file handle is released even if
        // writing a row throws.
        using (var writer = new StreamWriter(outFileName))
        {
            writer.WriteLine("MinScan\tMaxScan\tMinCharge\tMaxCharge\tMass\tSequence\tModifications");

            foreach (var prsmSet in targetSets)
            {
                writer.Write(prsmSet.MinScanNum);
                writer.Write("\t");
                writer.Write(prsmSet.MaxScanNum);
                writer.Write("\t");
                writer.Write(prsmSet.MinCharge);
                writer.Write("\t");
                writer.Write(prsmSet.MaxCharge);
                writer.Write("\t");
                writer.Write(prsmSet.Mass);
                writer.Write("\t");

                // Sequence and modifications are taken from the first match of the set.
                // NOTE(review): assumes every set holds at least one match - confirm
                // against LcMsFeatureTrain.CollectTrainSet.
                writer.Write(prsmSet[0].Sequence);
                writer.Write("\t");
                writer.Write(prsmSet[0].Modifications);

                // Rows are terminated with a bare "\n" (unlike the header, which uses WriteLine).
                writer.Write("\n");
            }
        }
    }
}
/// <summary>
/// For every dataset in <c>TrainSetFileLists</c> that has a <c>_IcTda.tsv</c> identification
/// file, writes match statistics for the target matches to <c>{dataset name}_target.tsv</c>
/// and for the decoy matches to <c>{dataset name}_decoy.tsv</c> in the output folder.
/// </summary>
/// <remarks>
/// The test is ignored when the identification-result folder does not exist.
/// The statistics rows themselves are produced by <c>GetMatchStatistics</c>, which receives
/// the spectrum, the sequence, the charge and the open writer.
/// </remarks>
public void TestGenerateFrequencyData()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    const string idFileFolder = @"D:\MassSpecFiles\training\IdScoring\MSPF_trainset";
    const string outFileFolder = @"D:\MassSpecFiles\training\IdScoring";

    if (!Directory.Exists(idFileFolder))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, idFileFolder);
    }

    // Register the modifications by name before any result file is read.
    // NOTE(review): presumably required so the readers can resolve these names - confirm.
    Modification.RegisterAndGetModification(Modification.Cysteinyl.Name, Modification.Cysteinyl.Composition);
    Modification.RegisterAndGetModification(Modification.Phosphorylation.Name, Modification.Phosphorylation.Composition);
    Modification.RegisterAndGetModification(Modification.Methylation.Name, Modification.Methylation.Composition);
    Modification.RegisterAndGetModification(Modification.DiMethylation.Name, Modification.DiMethylation.Composition);
    Modification.RegisterAndGetModification(Modification.TriMethylation.Name, Modification.TriMethylation.Composition);
    Modification.RegisterAndGetModification("Trioxidation", new Composition(0, 0, 0, 3, 0));

    foreach (var dataset in TrainSetFileLists)
    {
        var dataname = Path.GetFileNameWithoutExtension(dataset);
        var idFile = string.Format(@"{0}\{1}_IcTda.tsv", idFileFolder, dataname);
        var decoyFile = string.Format(@"{0}\{1}_IcDecoy.tsv", idFileFolder, dataname);

        if (!File.Exists(idFile))
        {
            continue;
        }

        var prsmReader = new ProteinSpectrumMatchReader(0.01);
        var prsmList = prsmReader.LoadIdentificationResult(idFile);

        // NOTE(review): taking Last() as the minimum score assumes the loaded list is
        // non-empty and ordered by descending score - confirm against the reader.
        var minScore = prsmList.Last().Score;

        // NOTE(review): decoyFile is not checked for existence here; behaviour for a
        // missing file depends on ReadMsPathFinderResult - confirm.
        var decoyMatches = prsmReader.ReadMsPathFinderResult(decoyFile, int.MaxValue, 1, Math.Max(minScore - 5, 10));

        var run = PbfLcMsRun.GetLcMsRun(dataset);
        var spectrumMatchSet = LcMsFeatureTrain.CollectTrainSet(dataset, idFile);
        Console.WriteLine(spectrumMatchSet.Count);

        // Each writer lives in a using statement so the file handle is released even
        // if spectrum lookup or statistics generation throws.
        var targetPath = string.Format(@"{0}\{1}_target.tsv", outFileFolder, dataname);
        using (var targetWriter = new StreamWriter(targetPath))
        {
            foreach (var matches in spectrumMatchSet)
            {
                foreach (var match in matches)
                {
                    // NOTE(review): 'as' yields null when the scan is not a ProductSpectrum;
                    // the null is passed through unchanged - confirm GetMatchStatistics copes.
                    var spec = run.GetSpectrum(match.ScanNum) as ProductSpectrum;
                    GetMatchStatistics(spec, match.GetSequence(), match.Charge, targetWriter);
                }
            }
        }

        var decoyPath = string.Format(@"{0}\{1}_decoy.tsv", outFileFolder, dataname);
        using (var decoyWriter = new StreamWriter(decoyPath))
        {
            foreach (var match in decoyMatches)
            {
                var sequence = match.GetSequence();
                var spec = run.GetSpectrum(match.ScanNum) as ProductSpectrum;
                GetMatchStatistics(spec, sequence, match.Charge, decoyWriter);
            }
        }
    }
}