/*
 * Build the feature row for a single accession.
 *
 * The features are computed by extractFeatures(accession, peptideScores) and
 * handed to CreateRow in this order: accession, cardinality, highest / mean /
 * median confidence score, highest / mean / median dCN and, only when the
 * STDEVINCLUDED symbol is defined, the confidence score standard deviation.
 *
 * Returns the DataRow produced by CreateRow.
 */
public static DataRow extractFeatureVector(String accession, List<PeptideScore> peptideScores)
{
    IdentificationFeatures features = extractFeatures(accession, peptideScores);

    int scoreCount = features.getCardinality();
    Double topScore = features.getHighestConfidenceScore();
    Double avgScore = features.getMeanConfidenceScore();
    Double midScore = features.getMedianConfidenceScore();
    Double topDCN = features.getHighestDCN();
    Double avgDCN = features.getMeanDCN();
    Double midDCN = features.getMedianDCN();

#if STDEVINCLUDED
    Double scoreStdev = features.getStdevConfidenceScore();

    // With zero or one score the computed value is discarded and the
    // DEFAULT_STDEV_MAX placeholder is reported instead.
    bool zeroOrOneScore = scoreCount == 0 || scoreCount == 1;
    if (zeroOrOneScore)
    {
        scoreStdev = DEFAULT_STDEV_MAX;
    }

    return CreateRow(accession, scoreCount, topScore, avgScore, midScore,
                     topDCN, avgDCN, midDCN, scoreStdev);
#else
    return CreateRow(accession, scoreCount, topScore, avgScore, midScore,
                     topDCN, avgDCN, midDCN);
#endif
}
/*
 * Write the identification features used for training the logistic regression
 * classifier.
 *
 * Output layout: a header line ("label," followed by
 * IdentificationFeatures.getHeader()), then one line per feature set, all
 * positives first and then all negatives. Each data line is the label
 * ("1" = positive training set, "0" = negative training set), a comma, and
 * the text returned by writeToFile(). Lines are separated by "\n" and no
 * trailing newline is written.
 *
 * file_path           - path handed to the StreamWriter constructor.
 * positiveTrainingSet - feature sets written with label 1.
 * negativeTrainingSet - feature sets written with label 0.
 *
 * On any exception the error is printed to the console and logged, the
 * method waits for a key press, and the process exits with code 0.
 */
public static void WriteIdentificationFeaturesFile(String file_path, List<IdentificationFeatures> positiveTrainingSet, List<IdentificationFeatures> negativeTrainingSet)
{
    log.Debug("Writing Identification Features to a file...");

    try
    {
        // The using statement closes the file on every exit path, so the
        // handle is not left open if a write fails part-way through.
        // Disposal also flushes the buffer, so no explicit Flush() calls
        // are needed.
        using (StreamWriter writer = new StreamWriter(file_path))
        {
            log.Debug("File name: " + file_path);

            // Write header TODO remove
            String header = "label," + IdentificationFeatures.getHeader();
            writer.Write(header);

            // in the first column, 1 indicates positive training set
            foreach (IdentificationFeatures i in positiveTrainingSet)
            {
                writer.Write("\n" + "1," + i.writeToFile());
            }

            // in the first column, 0 indicates negative training set
            foreach (IdentificationFeatures i in negativeTrainingSet)
            {
                writer.Write("\n" + "0," + i.writeToFile());
            }
        }
    }
    catch (Exception e)
    {
        Console.WriteLine(e.ToString());
        log.Error("Writing file unsuccessful!!!");
        // Keep the console open so the error can be read before exiting.
        Console.ReadKey();
        Environment.Exit(0);
    }

    log.Debug("Writing file successful.");
}
/*
 * Compute the identification features for one accession from its peptide
 * scores.
 *
 * The confidence score of a peptide is its XCorr (getXCorr()); the dCN comes
 * from getdCN(). For both, the highest, mean and median values are computed;
 * for the confidence score a standard deviation is computed as well.
 *
 * When peptideScores is empty, an IdentificationFeatures with the actual
 * cardinality and every statistic set to 0 is returned.
 */
public static IdentificationFeatures extractFeatures(String accession, List<PeptideScore> peptideScores)
{
    //TODO should this be number of unique peptides or number of peptide scores??
    int cardinality = peptideScores.Count;

    // Nothing to summarise: report zeros for every statistic.
    if (cardinality < 1)
    {
        return new IdentificationFeatures(accession, cardinality, 0, 0, 0, 0, 0, 0, 0);
    }

    double[] xcorrValues = new double[cardinality];
    double[] dcnValues = new double[cardinality];
    Double bestXCorr = Double.MinValue;
    Double bestDCN = Double.MinValue;

    // Single pass: collect the raw values and track the running maxima.
    int position = 0;
    foreach (PeptideScore entry in peptideScores)
    {
        Double xcorr = entry.getXCorr();
        Double dcn = entry.getdCN();

        xcorrValues[position] = xcorr;
        dcnValues[position] = dcn;
        position++;

        if (xcorr > bestXCorr)
        {
            bestXCorr = xcorr;
        }

        if (dcn > bestDCN)
        {
            bestDCN = dcn;
        }
    }

    // Means
    Double avgXCorr = xcorrValues.Average();
    Double avgDCN = dcnValues.Average();

    // Medians
    Double midXCorr = CalculateMedian(xcorrValues);
    Double midDCN = CalculateMedian(dcnValues);

    // Standard deviation of the confidence scores.
    // NOTE(review): the original author believed CalculateStdDev applies bias
    // correction (N-1 denominator), which estimates the variance more
    // accurately for a small N - confirm against CalculateStdDev.
    Double stdevXCorr = CalculateStdDev(xcorrValues);

    return new IdentificationFeatures(accession, cardinality,
                                      bestXCorr, avgXCorr, midXCorr,
                                      bestDCN, avgDCN, midDCN,
                                      stdevXCorr);
}