public static IFeatureSynthesizer<string> deriveOptimalClassifier()
{
    //Load databases
    DiscreteSeriesDatabase<string> allData = LoadRegionsDatabase();

    Tuple<DiscreteSeriesDatabase<string>, DiscreteSeriesDatabase<string>> split = allData.SplitDatabase(.8);
    DiscreteSeriesDatabase<string> trainingData = split.Item1;
    DiscreteSeriesDatabase<string> testData = split.Item2;

    string cat = "region";

    double optimalScore = 0;
    IFeatureSynthesizer<string> optimalClassifier = null;
    string optimalInfoStr = null;

    //Preliminary scan
    int[] ks = new int[] { 2, 3, 4 };
    //int[] minCutoffs = new int[]{5, 10, 20};
    int[] minCutoffs = new int[] { 10 };
    int[] kmerCounts = new int[] { 10, 25, 50, 100 };
    int[] smoothingAmounts = new int[] { 1, 5, 10 };

    string[] colNames = "k minCutoff kmerCount smoothingAmount score".Split(' ');
    Console.WriteLine(colNames.FoldToString("", "", ","));

    foreach (int k in ks) {
        foreach (int minCutoff in minCutoffs) {
            foreach (int kmerCount in kmerCounts) {
                foreach (int smoothingAmount in smoothingAmounts) {
                    IFeatureSynthesizer<string> classifier = new RegressorFeatureSynthesizerKmerFrequenciesVarK<string>(cat, minCutoff, smoothingAmount, kmerCount, k);
                    classifier.Train(trainingData);
                    double score = classifier.ScoreModel(testData);

                    string infoStr = new double[] { k, minCutoff, kmerCount, smoothingAmount, score }.FoldToString("", "", ",");
                    Console.WriteLine(infoStr);

                    if (score > optimalScore) {
                        optimalScore = score;
                        optimalClassifier = classifier;
                        optimalInfoStr = infoStr;
                    }
                }
            }
        }
    }

    Console.WriteLine("Optimal Classifier:");
    Console.WriteLine(optimalInfoStr);
    Console.WriteLine(optimalClassifier);

    return optimalClassifier;
}
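A hedged usage sketch, not part of the original source: it drives deriveOptimalClassifier and re-scores the winner on a fresh split, using only members that appear above (LoadRegionsDatabase, SplitDatabase, ScoreModel). The name RunGridSearchDemo and the rescoring step are assumptions, not the original harness.

public static void RunGridSearchDemo()
{
    //Run the exhaustive parameter scan above; the winner comes back trained on its own 80% split.
    IFeatureSynthesizer<string> best = deriveOptimalClassifier();

    //Hypothetical sanity check: re-score the winning classifier on a fresh 80/20 split.
    Tuple<DiscreteSeriesDatabase<string>, DiscreteSeriesDatabase<string>> split = LoadRegionsDatabase().SplitDatabase(.8);
    Console.WriteLine("Rescored optimal classifier: " + best.ScoreModel(split.Item2));
}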
public ClassifierAccuracyAnalysis<Ty> runAccuracyAnalysis()
{
    string nameCriterion = "filename"; //TODO: Input parameter or constant for this.

    //Filter out items not labeled for this criterion.
    labeledData = labeledData.FilterForCriterion(criterionByWhichToClassify);

    //Schema: the ordered set of label classes for this criterion.
    datasetSchema = labeledData.getLabelClasses(criterionByWhichToClassify).Order().ToArray();

    //Create mapping from strings to indices.
    schemaMapping = datasetSchema.IndexLookupDictionary();

    bucketCount = (int)(1.0 / bucketSize);
    classCount = datasetSchema.Length;

    //Raw data classifications:
    //Name, true class, predicted class, scores, winning score.
    classificationInstances = new List<Tuple<string, string, string, double[], double>>();
    if (testOverfitting) {
        trainingDataClassificationInstances = new List<Tuple<string, string, string, double[], double>>();
    }

    string classifierName = "\"" + AlgorithmReflectionExtensions.GetAlgorithmName(classifier) + "\"";

    Console.WriteLine("Running classifier " + classifierName + " on " + labeledData.data.Count + " items.");

    //TODO: Not a bad idea to duplicate the classifiers to increase parallelism.

    //Run and make classifiers.
    for (int i = 0; i < iterations; i++) {
        Console.WriteLine("Classifier Accuracy: Initiating round " + (i + 1) + " / " + iterations + " for " + classifierName + ".");

        Tuple<DiscreteSeriesDatabase<Ty>, DiscreteSeriesDatabase<Ty>> split = labeledData.SplitDatabase(trainSplitFrac); //TODO: Vary this?
        DiscreteSeriesDatabase<Ty> training = split.Item1;
        DiscreteSeriesDatabase<Ty> test = split.Item2;

        classifier.Train(training);

        string[] classifierSchema = classifier.GetClasses();

        classificationInstances.AddRange(test.data.AsParallel().WithExecutionMode(ParallelExecutionMode.ForceParallelism)
            .Select(item => classificationInfo(classifier, classifierSchema, schemaMapping, item, nameCriterion, criterionByWhichToClassify)));
        if (testOverfitting) {
            trainingDataClassificationInstances.AddRange(training.data.Take((int)(overfittingTestFrac * training.data.Count))
                .AsParallel().WithExecutionMode(ParallelExecutionMode.ForceParallelism)
                .Select(item => classificationInfo(classifier, classifierSchema, schemaMapping, item, nameCriterion, criterionByWhichToClassify)));
        }
    }

    //Confusion matrices.
    confusionMatrixCounts = new int[classCount, classCount]; // [a,b] : How often a is classified as b.
    confusionMatrixScores = new double[classCount, classCount];

    //Confusion matrix allocation.
    countsConfusionMatricesByConfidence = new int[bucketCount][,];
    scoresConfusionMatricesByConfidence = new double[bucketCount][,];
    for (int i = 0; i < bucketCount; i++) {
        countsConfusionMatricesByConfidence[i] = new int[classCount, classCount];
        scoresConfusionMatricesByConfidence[i] = new double[classCount, classCount];
    }

    //Confusion matrix population.
    foreach (var classification in classificationInstances) {
        //On a score of 1 or greater, clip to the top bucket. The highest confidence is always positive because confidences sum to 1.
        int confidenceBucket = Math.Min((int)Math.Floor(classification.Item5 * bucketCount), bucketCount - 1);

        //Counts
        confusionMatrixCounts[schemaMapping[classification.Item2], schemaMapping[classification.Item3]]++;
        countsConfusionMatricesByConfidence[confidenceBucket][schemaMapping[classification.Item2], schemaMapping[classification.Item3]]++;

        //Scores
        for (int j = 0; j < classCount; j++) {
            confusionMatrixScores[schemaMapping[classification.Item2], j] += classification.Item4[j];
            scoresConfusionMatricesByConfidence[confidenceBucket][schemaMapping[classification.Item2], j] += classification.Item4[j];
        }
    }

    //Aggregates
    countColumnSums = Enumerable.Range(0, classCount).Select(i => (double)confusionMatrixCounts.SumColumn(i)).ToArray();
    countRowSums = Enumerable.Range(0, classCount).Select(i => (double)confusionMatrixCounts.SumRow(i)).ToArray();
    scoreColumnSums = Enumerable.Range(0, classCount).Select(i => confusionMatrixScores.SumColumn(i)).ToArray();
    scoreRowSums = Enumerable.Range(0, classCount).Select(i => confusionMatrixScores.SumRow(i)).ToArray();

    classCountAccuracies = Enumerable.Range(0, classCount).Select(c => confusionMatrixCounts[c, c] / countRowSums[c]).ToArray();
    overallAccuracy = Enumerable.Range(0, classCount).Select(i => (double)confusionMatrixCounts[i, i]).Sum() / classificationInstances.Count;

    expectedAccuracyRandom = 1.0 / classCount;
    topClassSelectionAccuracy = labeledData.GroupBy(item => item.labels[criterionByWhichToClassify]).Select(grp => grp.Count()).Max() / (double)labeledData.data.Count;

    //Safety check.
    {
        double countSum = countColumnSums.Sum();
        double scoreSum = scoreColumnSums.Sum();

        //These should all be the same as classificationInstances.Count, to within numerical error.
        Trace.Assert(Math.Abs(countSum - classificationInstances.Count) < .00001);
        Trace.Assert(Math.Abs(scoreSum - classificationInstances.Count) < .00001);
    }

    //Class, confidence bucket. Row 0 holds the across-class average.
    accuracyByTrueClassAndConfidence = new double[classCount + 1, bucketCount];
    accuracyByPredictedClassAndConfidence = new double[classCount + 1, bucketCount];

    for (int i = 0; i < bucketCount; i++) {
        for (int j = 0; j < classCount; j++) {
            accuracyByTrueClassAndConfidence[j + 1, i] = (double)countsConfusionMatricesByConfidence[i][j, j] / (double)countsConfusionMatricesByConfidence[i].SumRow(j);
            accuracyByPredictedClassAndConfidence[j + 1, i] = (double)countsConfusionMatricesByConfidence[i][j, j] / (double)countsConfusionMatricesByConfidence[i].SumColumn(j);
        }

        //TODO: Never use a try catch block, and punish those who do.
        //Average() throws on an empty sequence; an all-NaN bucket has no defined accuracy.
        try {
            accuracyByTrueClassAndConfidence[0, i] = Enumerable.Range(1, classCount).Select(j => accuracyByTrueClassAndConfidence[j, i]).Where(val => !Double.IsNaN(val)).Average();
        } catch {
            accuracyByTrueClassAndConfidence[0, i] = Double.NaN;
        }
        try {
            //Average the predicted-class table here (the original averaged the true-class table, an apparent copy-paste slip).
            accuracyByPredictedClassAndConfidence[0, i] = Enumerable.Range(1, classCount).Select(j => accuracyByPredictedClassAndConfidence[j, i]).Where(val => !Double.IsNaN(val)).Average();
        } catch {
            accuracyByPredictedClassAndConfidence[0, i] = Double.NaN;
        }
    }

    //For use in math mode elements and matrices.
    datasetSchemaText = datasetSchema.Select(item => @"\text{" + LatexExtensions.limitLength(item, 15) + "}").ToArray(); //TODO: Limiting length could cause duplication.
    datasetSchemaTextRotated = datasetSchemaText.Select(item => @"\begin{turn}{70} " + item + @" \end{turn}").ToArray();

    return this;
}
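A minimal sketch, not in the original source, isolating the confidence-bucket arithmetic used when populating the matrices above. It assumes a winning score in [0, 1]; ConfidenceBucket is a hypothetical name.

public static int ConfidenceBucket(double winningScore, int bucketCount)
{
    //Floor maps scores in [k/bucketCount, (k+1)/bucketCount) to bucket k;
    //Min clips a score of exactly 1.0 into the top bucket, bucketCount - 1.
    return Math.Min((int)Math.Floor(winningScore * bucketCount), bucketCount - 1);
}

//E.g. with bucketCount = 10: ConfidenceBucket(.95, 10) == 9, and ConfidenceBucket(1.0, 10) == 9 rather than an out-of-range 10.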
public static void TestNewDesign()
{
    DiscreteSeriesDatabase<string> allData = LoadRegionsDatabase();

    Tuple<DiscreteSeriesDatabase<string>, DiscreteSeriesDatabase<string>> split = allData.SplitDatabase(.8);
    DiscreteSeriesDatabase<string> trainingData = split.Item1;
    DiscreteSeriesDatabase<string> testData = split.Item2;

    IFeatureSynthesizer<string> synth = new RegressorFeatureSynthesizerKmerFrequenciesVarK<string>("region", 8, 2, 100, 3);
    //IFeatureSynthesizer<string> synth = new RegressorFeatureSynthesizerKmerFrequencies<string>("region", 4, 10, 100, 3);
    //IFeatureSynthesizer<string> synth = new RegressorFeatureSynthesizerFrequencies<string>("region", 4, 10, 100);

    synth.Train(trainingData);

    Console.WriteLine(synth.ToString());

    synth.ScoreModel(testData, 2, "filename");

    Console.WriteLine(ClassifyDataSet(synth, testData, "filename")); //TODO: It may be good to use something unspecifiable in the file syntax, such as "filename;".

    //Console.WriteLine (allData.DatabaseLatexString("Regional Spanish Database"));
}
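A hedged sketch of an entry point tying the drivers above together; it is not in the original source, and Main plus the call ordering are assumptions.

public static void Main(string[] args)
{
    TestNewDesign(); //Single fixed-parameter run with k-mer frequency features.
    deriveOptimalClassifier(); //Exhaustive parameter scan; prints each row and the winner.
}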