/// <summary>
/// Filters "ProteinTotalSpectraAgainstExcludedSpectra.txt" (resolved against the current working
/// directory) down to the rows whose first tab-separated column is one of the names returned by
/// <c>ProteinProphetEvaluator.extractIdentifiedProteinNames</c> for
/// <c>InputFileOrganizer.OriginalProtXMLFile</c>.
/// The first line of the input is copied through unchanged, the kept rows are written to
/// "FilteredProteinWithExcludedSpectraCount.txt" in the run's output folder, and the number of
/// kept rows is printed to the console.
/// </summary>
public static void FilterForConfidentlyIdentifiedProteinOnly()
{
    // A set gives constant-time membership checks instead of scanning the whole list for every row.
    HashSet<String> identifiedProteins = new HashSet<String>(
        ProteinProphetEvaluator.extractIdentifiedProteinNames(InputFileOrganizer.OriginalProtXMLFile));

    String excludedSpectraPerProteinAll = "ProteinTotalSpectraAgainstExcludedSpectra.txt";
    String outputFile = Path.Combine(InputFileOrganizer.OutputFolderOfTheRun, "FilteredProteinWithExcludedSpectraCount.txt");

    int count = 0;

    // 'using' guarantees both streams are closed even if reading or writing throws;
    // previously the reader was never closed at all.
    using (StreamReader sr = new StreamReader(excludedSpectraPerProteinAll))
    using (StreamWriter sw = new StreamWriter(outputFile))
    {
        // The first line is passed through without filtering.
        sw.WriteLine(sr.ReadLine());

        String line;
        while ((line = sr.ReadLine()) != null)
        {
            String protName = line.Split('\t')[0];
            if (identifiedProteins.Contains(protName))
            {
                sw.WriteLine(line);
                count++;
            }
        }
    }

    Console.WriteLine(count);
}
/// <summary>
/// Calls <c>ProteinProphetEvaluator.ExtractPositiveProteinGroups</c> on a hard-coded prot.xml path
/// and prints the <c>Count</c> of the returned collection to the console.
/// </summary>
public static void DoJob()
{
    String protXmlPath = "C:\\Coding\\2019LavalleeLab\\GitProjectRealTimeMS\\TestData\\PreComputedFiles\\MS_QC_120min_interact.prot.xml";

    Console.WriteLine(ProteinProphetEvaluator.ExtractPositiveProteinGroups(protXmlPath).Count);
}
/// <summary>
/// Flushes the shared writer and reports the outcome of a finished experiment.
/// Outside simulation mode only the performance evaluator's output is written.
/// In simulation mode a ProteinProphet result is obtained, the total run time is recorded on the
/// experiment, and the performance vector is printed and written out.
/// </summary>
/// <param name="e">
/// The finished experiment. Its exclusion profile, names and timing fields are read; in simulation
/// mode its <c>totalRunTime</c> and <c>ppr</c> fields are assigned.
/// </param>
private static void PostExperimentProcessing(Experiment e)
{
    WriterClass.Flush();

    // Non-simulation runs have nothing to evaluate against: just dump the evaluator output.
    if (!GlobalVar.IsSimulation)
    {
        WriterClass.writeln(e.exclusionProfile.GetPerformanceEvaluator().outputPerformance());
        return;
    }

    ProteinProphetResult prophetResult;
    if (!GlobalVar.isSimulationForFeatureExtraction)
    {
        // Regular simulation: run post-processing for this experiment's exclusion profile.
        String resultFileName = e.experimentNumber + GlobalVar.experimentName;
        prophetResult = PostProcessingScripts.postProcessing(e.exclusionProfile, resultFileName, true);
    }
    else
    {
        // Feature-extraction simulation: reuse the original prot.xml result.
        prophetResult = ProteinProphetEvaluator.getProteinProphetResult(InputFileOrganizer.OriginalProtXMLFile);
    }

    e.totalRunTime = getCurrentTime() - e.experimentStartTime;

    String performanceVector = e.exclusionProfile.getPerformanceVector(
        e.experimentName,
        e.exclusionProfile.getAnalysisType().getDescription(),
        e.analysisTime,
        e.totalRunTime,
        prophetResult,
        12,
        e.exclusionProfile);

    Console.WriteLine(performanceVector);
    Console.WriteLine("Protein groups: " + prophetResult.getFilteredProteinGroups().Count);
    WriterClass.writeln(performanceVector);

    e.ppr = prophetResult;
}
/// <summary>
/// Prepares everything an experiment needs before it starts: builds the decoy FASTA and the IDX
/// database, records the simulation baseline when running in simulation mode, sets up the
/// exclusion database and initializes Comet.
/// </summary>
static void PreExperimentSetUp()
{
    ConstructDecoyFasta();
    ConstructIDX();

    if (GlobalVar.IsSimulation)
    {
        ms2SpectraList = Loader.parseMS2File(InputFileOrganizer.MS2SimulationTestFile).getSpectraArray();
        GlobalVar.ExperimentTotalScans = ms2SpectraList.Count;
        FullPepXMLAndProteinProphetSetup();
        baseLinePpr = ProteinProphetEvaluator.getProteinProphetResult(InputFileOrganizer.OriginalProtXMLFile);

        // In Alex's original code, "original experiment" means the experiment without any
        // exclusion or manipulation by this program, while "baseline comparison" means the
        // result of a "NoExclusion" run (a top-6 or top-12 DDA run), which is not implemented
        // here. In this program the two are therefore the same result.
        int baselineScanCount = (int)GlobalVar.ExperimentTotalScans;
        PerformanceEvaluator.setBaselineComparison(baseLinePpr, baselineScanCount, 12);
        PerformanceEvaluator.setOriginalExperiment(baseLinePpr.getNum_proteins_identified());
    }

    log.Debug("Setting up Database");
    database = databaseSetUp(InputFileOrganizer.ExclusionDBFasta);
    log.Debug("Done setting up database.");

    CometSingleSearch.InitializeComet(InputFileOrganizer.IDXDataBase, InputFileOrganizer.CometParamsFile);

    Console.WriteLine("pre-experimental setup finished");
}
/// <summary>
/// Compares the proteins listed (one per line) in a hard-coded "ExcludedProteinList.txt" with the
/// list returned by <c>getProteinsIdentified()</c> on the ProteinProphet result of a hard-coded
/// prot.xml file, and writes the size of each list and of their intersection to
/// "ExcludedProteinComparison.txt" in the run's output folder.
/// </summary>
public static void DoJob()
{
    String ExcludedProteinFile = "C:\\Coding\\2019LavalleeLab\\temp2\\Output\\Gold_MLGE_nonCheat.txt_output\\ExcludedProteinList.txt";
    String proteinProphetFile = "C:\\Coding\\2019LavalleeLab\\GitProjectRealTimeMS\\TestData\\PreComputedFiles\\MS_QC_120min_interact.prot.xml";

    // File.ReadAllLines opens, reads and closes the file in one call; the previous hand-rolled
    // StreamReader loop never closed its reader.
    List<String> inProgramConfidentlyIdentified = new List<String>(File.ReadAllLines(ExcludedProteinFile));

    ProteinProphetResult ppr = ProteinProphetEvaluator.getProteinProphetResult(proteinProphetFile);
    List<String> realConfidentIdentified = ppr.getProteinsIdentified();
    List<String> intersection = ListUtil.FindIntersection(inProgramConfidentlyIdentified, realConfidentIdentified);

    // The report is opened only once all inputs are loaded, so a failure above no longer leaves
    // an empty, still-open output file behind; 'using' closes it even if a write throws.
    String reportFile = Path.Combine(InputFileOrganizer.OutputFolderOfTheRun, "ExcludedProteinComparison.txt");
    using (StreamWriter sw = new StreamWriter(reportFile))
    {
        sw.WriteLine("In-Program excluded: {0}", inProgramConfidentlyIdentified.Count);
        sw.WriteLine("Real confidently identified: {0}", realConfidentIdentified.Count);
        sw.WriteLine("Intersection: {0}", intersection.Count);
    }
}
// mzML file handed to PartialPepXMLWriter.writePartialPepXMLFile by DoJob.
// Alternative path kept from the original source:
// "C:\\Coding\\2019LavalleeLab\\GoldStandardData\\MZML_Files\\MS_QC_120min.mzml"
static String mzml = "C:\\Coding\\2019LavalleeLab\\RealTest_Results_20200219\\MSQC_QE_200ng_HEK_2hr_to_run_200219172225.mzML";

/// <summary>
/// Runs a full Comet search and ProteinProphet search on <c>ms2File</c>, then repeats the
/// ProteinProphet search on a partial pep.xml that contains only the scans NOT listed in
/// <c>excludedSpectraFile</c> (one scan number per line), and reports:
/// the percentage of scans used, and the number of proteins identified from the partial data
/// as a percentage of the number identified from the full data.
/// Both report lines are printed to the console and written through <c>WriterClass</c>, which is
/// closed at the end. <c>InputFileOrganizer.OriginalCometOutput</c> and
/// <c>InputFileOrganizer.OriginalProtXMLFile</c> are overwritten with the full-search outputs.
/// </summary>
public static void DoJob()
{
    // Full Comet search.
    Console.WriteLine("Performing Comet search on full ms2 data");
    String fullCometFile = PostProcessingScripts.CometStandardSearch(ms2File, InputFileOrganizer.OutputFolderOfTheRun, true);
    InputFileOrganizer.OriginalCometOutput = fullCometFile;

    // ProteinProphet on the full search result.
    Console.WriteLine("Perform a protein prophet search on full pepxml");
    String fullProteinProphetFile = PostProcessingScripts.ProteinProphetSearch(fullCometFile, InputFileOrganizer.OutputFolderOfTheRun, true);
    InputFileOrganizer.OriginalProtXMLFile = fullProteinProphetFile;
    ProteinProphetResult baseLinePpr = ProteinProphetEvaluator.getProteinProphetResult(InputFileOrganizer.OriginalProtXMLFile);

    // Load spectra.
    Console.WriteLine("loading spectra array");
    List<Spectra> ls = Loader.parseMS2File(ms2File).getSpectraArray();

    // The excluded scan numbers are only ever used for membership tests, so a set replaces the
    // list that was scanned linearly for every spectrum.
    HashSet<int> excludedSpectra = new HashSet<int>();

    // 'using' closes the reader even if a line fails to parse; it was never closed before.
    using (StreamReader sr = new StreamReader(excludedSpectraFile))
    {
        String line;
        while ((line = sr.ReadLine()) != null)
        {
            // Skip blank lines (e.g. a trailing newline) instead of crashing in int.Parse.
            if (line.Trim().Length == 0)
            {
                continue;
            }
            excludedSpectra.Add(int.Parse(line));
        }
    }

    List<int> includedSpectra = new List<int>();
    foreach (Spectra sp in ls)
    {
        int scanNum = sp.getScanNum();
        if (!excludedSpectra.Contains(scanNum))
        {
            includedSpectra.Add(scanNum);
        }
    }

    // Alternative hard-coded paths kept from the original source:
    //   "C:\\Coding\\2019LavalleeLab\\GoldStandardData\\pepxml\\MS_QC_120min_partial.pep.xml"
    //   "C:\\Coding\\2019LavalleeLab\\GoldStandardData\\Database\\uniprot_SwissProt_Human_1_11_2017.fasta"
    String outputCometFile = Path.Combine(InputFileOrganizer.OutputFolderOfTheRun, "realTestpartialOut.pep.xml");
    String fastaFile = InputFileOrganizer.FASTA_FILE;

    PartialPepXMLWriter.writePartialPepXMLFile(fullCometFile, includedSpectra, outputCometFile, mzml, fastaFile, outputCometFile);

    String partialProt = PostProcessingScripts.ProteinProphetSearch(outputCometFile, InputFileOrganizer.OutputFolderOfTheRun, true);
    ProteinProphetResult partialPpr = ProteinProphetEvaluator.getProteinProphetResult(partialProt);

    // Doubles are used on purpose so the ratios below are floating-point divisions.
    double partialNum = partialPpr.getNum_proteins_identified();
    double totalNum = baseLinePpr.getNum_proteins_identified();
    double idSens = partialNum / totalNum * 100.0;

    double includedScanNum = includedSpectra.Count;
    double totalScanNum = ls.Count;
    double usedResource = includedScanNum / totalScanNum * 100;

    String line1 = String.Format("includedScans {0} \t totalScanNum {1} \tUsedResources {2}", includedScanNum, totalScanNum, usedResource);
    String line2 = String.Format("partialNum {0} \t totalNum {1} \tidsens {2}", partialNum, totalNum, idSens);

    Console.WriteLine(line1);
    Console.WriteLine(line2);
    WriterClass.writeln(line1);
    WriterClass.writeln(line2);
    WriterClass.CloseWriter();
}
/// <summary>
/// Splits identification features into training sets and writes three feature files:
/// positive vs. negative, positive vs. non-positive, and positive vs. non-positive with every
/// accession containing the decoy prefix removed.
/// </summary>
/// <param name="idf">
/// Features to classify. Entries whose <c>getCardinality()</c> is not greater than zero are ignored.
/// </param>
private static void writeFeatures(List<IdentificationFeatures> idf)
{
    log.Info("Classifying positive and negative sets");

    // Names of the proteins ProteinProphet reports as identified (per the original comment: at
    // 0.01 FDR). Held in a set because it is queried once per feature.
    HashSet<String> identifiedProteins = new HashSet<String>(
        ProteinProphetEvaluator.extractIdentifiedProteinNames(InputFileOrganizer.OriginalProtXMLFile));

    // Names of the proteins used as the negative training set, selected with threshold 0.25.
    // NOTE(review): the original comment spoke of "above 20% FDR" while the argument is 0.25 —
    // confirm which one is intended.
    HashSet<String> negativeTrainingSetProteins = new HashSet<String>(
        ProteinProphetEvaluator.extractNegativeTrainingSetProteinNames(InputFileOrganizer.OriginalProtXMLFile, 0.25));

    List<IdentificationFeatures> positiveTrainingSet = new List<IdentificationFeatures>();
    List<IdentificationFeatures> negativeTrainingSet = new List<IdentificationFeatures>();
    List<IdentificationFeatures> nonPositiveTrainingSet = new List<IdentificationFeatures>();

    foreach (IdentificationFeatures feature in idf)
    {
        String accession = feature.getAccession();
        if (feature.getCardinality() <= 0)
        {
            continue;
        }

        if (accession.StartsWith(GlobalVar.DecoyPrefix))
        {
            // Decoy proteins are both negative and non-positive examples.
            negativeTrainingSet.Add(feature);
            nonPositiveTrainingSet.Add(feature);
            continue;
        }

        // Real protein: positive when identified, otherwise non-positive.
        if (identifiedProteins.Contains(accession))
        {
            positiveTrainingSet.Add(feature);
        }
        else
        {
            nonPositiveTrainingSet.Add(feature);
        }

        // Independently of the above, it is negative only when it is in the negative name set.
        if (negativeTrainingSetProteins.Contains(accession))
        {
            negativeTrainingSet.Add(feature);
        }
    }

    WriteIdentificationFeaturesFile(OutputFile_PositiveAndNegative, positiveTrainingSet, negativeTrainingSet);
    WriteIdentificationFeaturesFile(OutputFile_PositiveAndNonPositive, positiveTrainingSet, nonPositiveTrainingSet);

    // The "no decoy" variant drops every accession that CONTAINS the decoy prefix anywhere
    // (the classification above only checks whether the accession STARTS with it).
    List<IdentificationFeatures> positiveSetNoDecoy =
        positiveTrainingSet.FindAll(f => !f.getAccession().Contains(GlobalVar.DecoyPrefix));
    List<IdentificationFeatures> nonPositiveSetNoDecoy =
        nonPositiveTrainingSet.FindAll(f => !f.getAccession().Contains(GlobalVar.DecoyPrefix));

    WriteIdentificationFeaturesFile(OutputFile_PositiveAndNonPositive_NoDecoy, positiveSetNoDecoy, nonPositiveSetNoDecoy);
}