/// <summary>
/// Filters "ProteinTotalSpectraAgainstExcludedSpectra.txt" down to the rows whose
/// first tab-separated field is one of the protein names returned by
/// <c>ProteinProphetEvaluator.extractIdentifiedProteinNames</c> for the original
/// protXML file. The first line of the input is copied through unfiltered, the
/// result is written to "FilteredProteinWithExcludedSpectraCount.txt" in the
/// run's output folder, and the number of rows kept is printed to the console.
/// </summary>
public static void FilterForConfidentlyIdentifiedProteinOnly()
{
    // A set gives constant-time membership tests; the lookup below runs once per input row.
    HashSet<String> identifiedProteins = new HashSet<String>(
        ProteinProphetEvaluator.extractIdentifiedProteinNames(InputFileOrganizer.OriginalProtXMLFile));

    // NOTE(review): Path.Combine with a single argument returns that argument, so the
    // input file is resolved against the current working directory, unlike the output
    // file which goes to the run's output folder. Confirm this is intended.
    String excludedSpectraPerProteinAll = Path.Combine("ProteinTotalSpectraAgainstExcludedSpectra.txt");
    String outputFile = Path.Combine(InputFileOrganizer.OutputFolderOfTheRun, "FilteredProteinWithExcludedSpectraCount.txt");

    int count = 0;

    // "using" closes both streams even when reading or writing throws; previously the
    // reader was never closed at all.
    using (StreamReader sr = new StreamReader(excludedSpectraPerProteinAll))
    using (StreamWriter sw = new StreamWriter(outputFile))
    {
        // The first line is copied as-is, without being matched against the protein names.
        sw.WriteLine(sr.ReadLine());

        String line;
        while ((line = sr.ReadLine()) != null)
        {
            // The protein name is the first tab-separated column of the row.
            String protName = line.Split('\t')[0];
            if (identifiedProteins.Contains(protName))
            {
                sw.WriteLine(line);
                count++;
            }
        }
    }

    Console.WriteLine(count);
}
/// <summary>
/// Partitions the given identification features into positive, negative and
/// non-positive training sets and writes three feature files via
/// <c>WriteIdentificationFeaturesFile</c>: positive vs. negative, positive vs.
/// non-positive, and positive vs. non-positive with decoy accessions removed.
/// </summary>
/// <param name="idf">
/// Features to classify. Entries whose <c>getCardinality()</c> is not greater
/// than 0 are ignored.
/// </param>
private static void writeFeatures(List<IdentificationFeatures> idf)
{
    log.Info("Classifying positive and negative sets");

    // Protein names reported as identified for the original protXML file (per the
    // original comments: confidently identified at 0.01 FDR with ProteinProphet).
    // Held in a set because membership is tested once per feature below.
    HashSet<String> identifiedProteins = new HashSet<String>(
        ProteinProphetEvaluator.extractIdentifiedProteinNames(InputFileOrganizer.OriginalProtXMLFile));

    // Protein names selected for the negative training set, using 0.25 as the
    // threshold argument.
    // NOTE(review): an earlier comment here described this as "above 20% FDR", which
    // does not match the 0.25 actually passed - confirm which value is intended.
    HashSet<String> negativeTrainingSetProteins = new HashSet<String>(
        ProteinProphetEvaluator.extractNegativeTrainingSetProteinNames(InputFileOrganizer.OriginalProtXMLFile, 0.25));

    // Real proteins whose accession is in identifiedProteins.
    List<IdentificationFeatures> positiveTrainingSet = new List<IdentificationFeatures>();
    // Real proteins whose accession is in negativeTrainingSetProteins, plus all decoys.
    List<IdentificationFeatures> negativeTrainingSet = new List<IdentificationFeatures>();
    // Everything that did not go into the positive set (decoys included).
    List<IdentificationFeatures> nonPositiveTrainingSet = new List<IdentificationFeatures>();

    // Determine which features are in the positive, negative or non-positive set.
    foreach (IdentificationFeatures feature in idf)
    {
        if (feature.getCardinality() > 0)
        {
            String accession = feature.getAccession();

            if (accession.StartsWith(GlobalVar.DecoyPrefix))
            {
                // Decoy protein: counts as both negative and non-positive.
                negativeTrainingSet.Add(feature);
                nonPositiveTrainingSet.Add(feature);
            }
            else
            {
                // Real protein: positive when identified, non-positive otherwise.
                if (identifiedProteins.Contains(accession))
                {
                    positiveTrainingSet.Add(feature);
                }
                else
                {
                    nonPositiveTrainingSet.Add(feature);
                }

                // Checked independently of the positive/non-positive decision above.
                if (negativeTrainingSetProteins.Contains(accession))
                {
                    negativeTrainingSet.Add(feature);
                }
            }
        }
    }

    WriteIdentificationFeaturesFile(OutputFile_PositiveAndNegative, positiveTrainingSet, negativeTrainingSet);
    WriteIdentificationFeaturesFile(OutputFile_PositiveAndNonPositive, positiveTrainingSet, nonPositiveTrainingSet);

    // NOTE(review): this filter drops any accession that *contains* the decoy prefix,
    // whereas the classification above only treats accessions that *start with* it as
    // decoys. The difference is preserved as found - confirm whether it is intended.
    Predicate<IdentificationFeatures> isNotDecoy =
        feature => !feature.getAccession().Contains(GlobalVar.DecoyPrefix);

    // FindAll returns a new list with the matching elements in their original order.
    List<IdentificationFeatures> positiveSetNoDecoy = positiveTrainingSet.FindAll(isNotDecoy);
    List<IdentificationFeatures> nonPositiveSetNoDecoy = nonPositiveTrainingSet.FindAll(isNotDecoy);

    WriteIdentificationFeaturesFile(OutputFile_PositiveAndNonPositive_NoDecoy, positiveSetNoDecoy, nonPositiveSetNoDecoy);
}