/// <summary>
/// Reads the table stored at <paramref name="path"/> and converts it into an
/// <see cref="IDataView"/>.
/// </summary>
/// <param name="mlContext">
/// Not read by the current body; only the commented-out train/test split below refers to it.
/// </param>
/// <param name="path">
/// Location handed unchanged to <c>IdentificationFeatureExtractionUtil.loadDataTable</c>.
/// </param>
/// <returns>
/// The value produced by <c>IdentificationFeatureExtractionUtil.transformFeatures</c> for the loaded table.
/// </returns>
public static IDataView LoadData(MLContext mlContext, String path)
{
    DataTable featureTable = IdentificationFeatureExtractionUtil.loadDataTable(path);

    // Disabled train/test split, kept for reference:
    //TrainTestData splitDataView = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.2);

    // NOTE(review): the meaning of the `true` flag is defined by transformFeatures and is not visible here.
    return IdentificationFeatureExtractionUtil.transformFeatures(featureTable, true);
}
//static string mzmlFileBaseName="MS_QC_240min";

/// <summary>
/// Runs a single no-exclusion simulation on <paramref name="ms2File"/>, pulls the
/// identification features out of the resulting exclusion profile, recalibrates them,
/// and hands them to <c>writeFeatures</c>.
/// </summary>
/// <param name="ms2File">
/// MS2 file to simulate; stored in <c>InputFileOrganizer.MS2SimulationTestFile</c> and used
/// (without its extension) as the stem of the output file names.
/// </param>
/// <param name="extractedFeatureSavedFile_posAndNeg">
/// Receives the value of <c>OutputFile_PositiveAndNegative</c>.
/// </param>
/// <param name="extractedFeatureSavedFile_posAndNonPos">
/// Receives the value of <c>OutputFile_PositiveAndNonPositive</c>.
/// </param>
/// <remarks>
/// Side effects: overwrites the three <c>OutputFile_*</c> members, calls
/// <c>SimulationWithDecoyParamsSetUp</c>, and then sets five <c>GlobalVar</c> values to 1.
/// </remarks>
public static void ExtractFeatures(String ms2File, out String extractedFeatureSavedFile_posAndNeg, out String extractedFeatureSavedFile_posAndNonPos)
{
    Console.WriteLine("Extracting features from {0}", ms2File);

    InputFileOrganizer.MS2SimulationTestFile = ms2File;
    //InputFileOrganizer.MZMLSimulationTestFile = mzmlFile;

    // All output files live in the run's output folder and share the MS2 file's base name.
    String outputStem = Path.GetFileNameWithoutExtension(ms2File);
    OutputFile_PositiveAndNegative = Path.Combine(
        InputFileOrganizer.OutputFolderOfTheRun,
        outputStem + "_extractedFeatures_PositiveAndNegative.tsv");
    OutputFile_PositiveAndNonPositive = Path.Combine(
        InputFileOrganizer.OutputFolderOfTheRun,
        outputStem + "_extractedFeatures_positiveAndNonPositive.tsv");
    OutputFile_PositiveAndNonPositive_NoDecoy = Path.Combine(
        InputFileOrganizer.OutputFolderOfTheRun,
        outputStem + "_extractedFeatures_positiveAndNonPositive_NoDecoy.tsv");

    // The current feature extraction includes decoy proteins in the database and testing set.
    SimulationWithDecoyParamsSetUp();

    // Placeholder values; per the original author they do not matter for this run.
    // They are assigned after the set-up call above, so they win over anything it configured.
    GlobalVar.ppmTolerance = 1;
    GlobalVar.retentionTimeWindowSize = 1;
    GlobalVar.AccordThreshold = 1;
    GlobalVar.XCorr_Threshold = 1;
    GlobalVar.NumDBThreshold = 1;

    // NOTE(review): in the collapsed source this call directly follows a bare `//`;
    // it is treated as live code here - confirm against the original formatting.
    log.Info("Running No Exclusion Simulation");
    ExclusionProfile noExclusionProfile = ExclusionExplorer.SingleSimulationRun(ExclusionProfileEnum.NO_EXCLUSION_PROFILE);

    log.Info("Extracting identification feature from exclusion profile");
    List<IdentificationFeatures> features = noExclusionProfile.getFeatures();

    log.Info("Recalibrating stDev");
    features = IdentificationFeatureExtractionUtil.recalibrateStDev(features);
    writeFeatures(features);

    // Report the two primary output paths back to the caller.
    extractedFeatureSavedFile_posAndNeg = OutputFile_PositiveAndNegative;
    extractedFeatureSavedFile_posAndNonPos = OutputFile_PositiveAndNonPositive;
    Console.WriteLine("Extracted Feature written to {0} and {1}", OutputFile_PositiveAndNegative, OutputFile_PositiveAndNonPositive);
}
/// <summary>
/// Processes one identification result: records the peptide's scores, adds the peptide to the
/// exclusion list when its xCorr exceeds the cutoff, and excludes every not-yet-excluded parent
/// protein that <c>assessProteinIdentificationConfidence</c> reports as confidently identified.
/// </summary>
/// <param name="id">
/// Identification to process. When <c>null</c>, the scan is counted as an unidentified MS2
/// and nothing else happens.
/// </param>
protected void evaluateIdentification(IDs id)
{
    // No identification for this scan: count it and stop.
    if (id == null)
    {
        performanceEvaluator.countMS2UnidentifiedAnalyzed();
        return;
    }

    // id is known to be non-null from here on.
    Peptide matchedPeptide = getPeptideFromIdentification(id);

    // add decoy or non-existent protein connections
    // database.AddProteinFromIdentification(pep, id.getParentProteinAccessions());

    double xCorrScore = id.getXCorr();
    double deltaCN = id.getDeltaCN();
    matchedPeptide.addScore(xCorrScore, deltaCN);

#if (!DONTEVALUATE)
    performanceEvaluator.evaluateAnalysis(exclusionList, matchedPeptide);
#endif

    // Peptides scoring strictly above this xCorr value are excluded from further analysis.
    const double xCorrCutoff = 2.5;
    bool passesCutoff = xCorrScore > xCorrCutoff;
    if (passesCutoff)
    {
        performanceEvaluator.countPeptidesExcluded();
        log.Debug("xCorrThreshold passed. Peptide added to the exclusion list.");
        exclusionList.addPeptide(matchedPeptide);

        // Retention-time alignment is only recalibrated for peptides that pass the cutoff.
        calibrateRetentionTime(matchedPeptide);
    }

    // Per-accession verdicts for the peptide's parent proteins, computed with lrAccord
    // (described by the original author as a logistic regression classifier).
    Dictionary<String, Boolean> confidenceByAccession =
        IdentificationFeatureExtractionUtil.assessProteinIdentificationConfidence(matchedPeptide.getProteins(), lrAccord);

    List<Protein> newlyExcluded = new List<Protein>();
    foreach (Protein candidate in matchedPeptide.getProteins())
    {
        // Skip proteins that were already excluded earlier.
        if (candidate.IsExcluded())
        {
            continue;
        }

        // Skip proteins the classifier does not consider confidently identified.
        if (!confidenceByAccession[candidate.getAccession()])
        {
            continue;
        }

        candidate.setExcluded(true);
        log.Debug("Parent protein " + candidate.getAccession() + " is identified confidently "
                  + candidate.getNumDB() + " times!");
        performanceEvaluator.countProteinsExcluded();
        newlyExcluded.Add(candidate);
    }

    // Hand all newly excluded proteins to the exclusion list in one call.
    exclusionList.addProteins(newlyExcluded);
}
/// <summary>
/// Handles a single identification: stores its xCorr/deltaCN on the matched peptide, excludes the
/// peptide if its xCorr is above the cutoff, then excludes each parent protein that is not yet
/// excluded and that <c>assessProteinIdentificationConfidence</c> flags as confidently identified.
/// </summary>
/// <param name="id">
/// Identification to handle. A <c>null</c> value is counted as an unidentified MS2 and the
/// method returns immediately.
/// </param>
/// <remarks>
/// When <c>TRACKEXCLUDEDPROTEINFEATURE</c> is defined, the item array of each excluded protein's
/// <c>vectorize()</c> result is appended to <c>excludedProteinFeatureList</c> before the protein
/// is marked excluded.
/// </remarks>
protected void evaluateIdentification(IDs id)
{
    // Nothing was identified for this scan: count it and bail out.
    if (id == null)
    {
        performanceEvaluator.countMS2UnidentifiedAnalyzed();
        return;
    }

    // Past the guard above, id cannot be null.
    Peptide observedPeptide = getPeptideFromIdentification(id);

    //log.Info("Peptide Observed Time: {0}\tPredicted Time: {1} -----------------", id.getScanTime(),pep.getRetentionTime().getRetentionTimeStart());

    // add decoy or non-existent protein connections
    // database.AddProteinFromIdentification(pep, id.getParentProteinAccessions());

    double observedXCorr = id.getXCorr();
    double observedDeltaCN = id.getDeltaCN();
    observedPeptide.addScore(observedXCorr, observedDeltaCN);

#if (!DONTEVALUATE)
    performanceEvaluator.evaluateAnalysis(exclusionList, observedPeptide);
#endif

    //RetentionTime rt = pep.getRetentionTime();
    //if (!rtCalcPredictedRT.Keys.Contains(pep.getSequence()))
    //{
    //	rtCalcPredictedRT.Add(pep.getSequence(), rt.getRetentionTimePeak());
    //}
    //double[] values = new double[] { id.getScanTime(), id.getXCorr(), rt.getRetentionTimePeak(), rt.getRetentionTimeStart() + GlobalVar.retentionTimeWindowSize, RetentionTime.getRetentionTimeOffset(), rtCalcPredictedRT[pep.getSequence()], rt.IsPredicted() ? 1 : 0 };

    // A peptide is excluded only when its xCorr is strictly greater than this value.
    const double minimumXCorrForExclusion = 2.5;
    if (observedXCorr > minimumXCorrForExclusion)
    {
        performanceEvaluator.countPeptidesExcluded();
        log.Debug("xCorrThreshold passed. Peptide added to the exclusion list.");
        exclusionList.addPeptide(observedPeptide);

        // Recalibrate the retention-time alignment, but only for peptides that passed the cutoff.
        calibrateRetentionTime(observedPeptide);
    }

    // Classifier verdicts keyed by protein accession, computed with lrAccord
    // (described by the original author as a logistic regression classifier).
    Dictionary<String, Boolean> verdicts =
        IdentificationFeatureExtractionUtil.assessProteinIdentificationConfidence(observedPeptide.getProteins(), lrAccord);

    List<Protein> excludedNow = new List<Protein>();
    foreach (Protein parent in observedPeptide.getProteins())
    {
        // Already excluded earlier: do not exclude (or count) it again.
        if (parent.IsExcluded())
        {
            continue;
        }

        bool confident = verdicts[parent.getAccession()];
        if (!confident)
        {
            continue;
        }

#if TRACKEXCLUDEDPROTEINFEATURE
        excludedProteinFeatureList.Add(parent.vectorize().ItemArray);
#endif
        parent.setExcluded(true);
        log.Debug("Parent protein " + parent.getAccession() + " is identified confidently "
                  + parent.getNumDB() + " times!");
        performanceEvaluator.countProteinsExcluded();
        excludedNow.Add(parent);
    }

    // Register every protein excluded in this pass with the exclusion list at once.
    exclusionList.addProteins(excludedNow);
}
/// <summary>
/// Produces this object's feature vector as a <see cref="DataRow"/>.
/// </summary>
/// <returns>
/// The row returned by <c>IdentificationFeatureExtractionUtil.extractFeatureVector</c> for this
/// object's <c>accession</c> and <c>peptideScores</c>.
/// </returns>
public DataRow vectorize()
{
    DataRow featureRow = IdentificationFeatureExtractionUtil.extractFeatureVector(accession, peptideScores);
    return featureRow;
}
/// <summary>
/// Produces this object's features as an <c>IdentificationFeatures</c> instance.
/// </summary>
/// <returns>
/// The value returned by <c>IdentificationFeatureExtractionUtil.extractFeatures</c> for this
/// object's <c>accession</c> and <c>peptideScores</c>.
/// </returns>
public IdentificationFeatures extractFeatures()
{
    IdentificationFeatures features = IdentificationFeatureExtractionUtil.extractFeatures(accession, peptideScores);
    return features;
}