Example #1
        public static IFeatureSynthesizer <string> deriveOptimalClassifier()
        {
            //Load databases
            DiscreteSeriesDatabase <string> allData = LoadRegionsDatabase();

            Tuple <DiscreteSeriesDatabase <string>, DiscreteSeriesDatabase <string> > split = allData.SplitDatabase(.8);

            DiscreteSeriesDatabase <string> trainingData = split.Item1;
            DiscreteSeriesDatabase <string> testData     = split.Item2;

            string cat = "region";

            double optimalScore = 0;
            IFeatureSynthesizer <string> optimalClassifier = null;
            string optimalInfoStr = null;

            //Preliminary scan

            int[] ks = new int[] { 2, 3, 4 };
            //int[] minCutoffs = new int[]{5, 10, 20};
            int[] minCutoffs       = new int[] { 10 };
            int[] kmerCounts       = new int[] { 10, 25, 50, 100 };
            int[] smoothingAmounts = new int[] { 1, 5, 10 };

            string[] colNames = "k minCutoff kmerCount smoothingAmount score".Split(' ');

            Console.WriteLine(colNames.FoldToString("", "", ","));

            foreach (int k in ks)
            {
                foreach (int minCutoff in minCutoffs)
                {
                    foreach (int kmerCount in kmerCounts)
                    {
                        foreach (int smoothingAmount in smoothingAmounts)
                        {
                            IFeatureSynthesizer <string> classifier = new RegressorFeatureSynthesizerKmerFrequenciesVarK <string>(cat, minCutoff, smoothingAmount, kmerCount, k);
                            classifier.Train(trainingData);

                            double score = classifier.ScoreModel(testData);

                            string infoStr = new double[] { k, minCutoff, kmerCount, smoothingAmount, score }.FoldToString("", "", ",");

                            Console.WriteLine(infoStr);
                            if (score > optimalScore)
                            {
                                optimalScore      = score;
                                optimalClassifier = classifier;
                                optimalInfoStr    = infoStr;
                            }
                        }
                    }
                }
            }

            Console.WriteLine("Optimal Classifier:");
            Console.WriteLine(optimalInfoStr);
            Console.WriteLine(optimalClassifier);

            return optimalClassifier;
        }
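
The method above grid-searches k, minCutoff, kmerCount and smoothingAmount on an 80/20 split and returns the highest-scoring classifier. A minimal usage sketch, assuming the types and methods shown above behave as in the example (the Main wrapper and variable names are illustrative only, not part of the project):

        public static void Main()
        {
            //Run the hyperparameter scan; the returned classifier is already trained.
            IFeatureSynthesizer <string> best = deriveOptimalClassifier();

            //Score it once more on a fresh split of the same database. Note that this
            //sketch reuses data the scan may have trained on, so treat the number as a
            //sanity check rather than an unbiased estimate.
            DiscreteSeriesDatabase <string> allData = LoadRegionsDatabase();
            Tuple <DiscreteSeriesDatabase <string>, DiscreteSeriesDatabase <string> > split = allData.SplitDatabase(.8);
            Console.WriteLine("Held-out score: " + best.ScoreModel(split.Item2));
        }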
Example #2
        public ClassifierAccuracyAnalysis <Ty> runAccuracyAnalysis()
        {
            string nameCriterion = "filename";             //TODO: Input parameter or constant for this.

            //Filter out items not labeled for this criterion.
            labeledData = labeledData.FilterForCriterion(criterionByWhichToClassify);

            //Determine the set of label classes for this criterion, in a stable (sorted) order.
            datasetSchema = labeledData.getLabelClasses(criterionByWhichToClassify).Order().ToArray();

            //Create mapping from strings to indices
            schemaMapping = datasetSchema.IndexLookupDictionary();

            bucketCount = (int)(1.0 / bucketSize);
            classCount  = datasetSchema.Length;

            //Raw data classifications:
            //Name, true class, predicted class, scores, winning score;
            classificationInstances = new List <Tuple <string, string, string, double[], double> > ();
            if (testOverfitting)
            {
                trainingDataClassificationInstances = new List <Tuple <string, string, string, double[], double> > ();
            }

            string classifierName = "\"" + AlgorithmReflectionExtensions.GetAlgorithmName(classifier) + "\"";

            Console.WriteLine("Running classifier " + classifierName + " on " + labeledData.data.Count + " items.");

            //TODO: Not a bad idea to duplicate the classifiers to increase parallelism.

            //Run and make classifiers.
            for (int i = 0; i < iterations; i++)
            {
                Console.WriteLine("Classifier Accuracy: Initiating round " + (i + 1) + " / " + iterations + " for " + classifierName + ".");

                Tuple <DiscreteSeriesDatabase <Ty>, DiscreteSeriesDatabase <Ty> > split = labeledData.SplitDatabase(trainSplitFrac);              //TODO: Vary this?
                DiscreteSeriesDatabase <Ty> training = split.Item1;
                DiscreteSeriesDatabase <Ty> test     = split.Item2;

                classifier.Train(training);

                string[] classifierSchema = classifier.GetClasses();

                classificationInstances.AddRange(test.data.AsParallel().WithExecutionMode(ParallelExecutionMode.ForceParallelism).Select(item => classificationInfo(classifier, classifierSchema, schemaMapping, item, nameCriterion, criterionByWhichToClassify)));
                if (testOverfitting)
                {
                    trainingDataClassificationInstances.AddRange(training.data.Take((int)(overfittingTestFrac * training.data.Count)).AsParallel().WithExecutionMode(ParallelExecutionMode.ForceParallelism).Select(item => classificationInfo(classifier, classifierSchema, schemaMapping, item, nameCriterion, criterionByWhichToClassify)));
                }
            }


            //Confusion Matrices.
            confusionMatrixCounts = new int[classCount, classCount];             // [a,b] : How often a is classified as b
            confusionMatrixScores = new double[classCount, classCount];

            //Confusion matrix allocation
            countsConfusionMatricesByConfidence = new int[bucketCount][, ];
            scoresConfusionMatricesByConfidence = new double[bucketCount][, ];

            for (int i = 0; i < bucketCount; i++)
            {
                countsConfusionMatricesByConfidence [i] = new int[classCount, classCount];
                scoresConfusionMatricesByConfidence [i] = new double[classCount, classCount];
            }

            //Confusion Matrix population
            foreach (var classification in classificationInstances)
            {
                int confidenceBucket = Math.Min((int)Math.Floor(classification.Item5 * bucketCount), bucketCount - 1);                   //On a score of 1 or greater, clip to the top bucket.  Highest confidence is always positive because confidences sum to 1.
                //Counts
                confusionMatrixCounts [schemaMapping [classification.Item2], schemaMapping [classification.Item3]]++;
                countsConfusionMatricesByConfidence [confidenceBucket] [schemaMapping [classification.Item2], schemaMapping [classification.Item3]]++;

                //Scores
                for (int j = 0; j < classCount; j++)
                {
                    confusionMatrixScores [schemaMapping [classification.Item2], j] += classification.Item4 [j];
                    scoresConfusionMatricesByConfidence [confidenceBucket] [schemaMapping [classification.Item2], j] += classification.Item4 [j];
                }
            }


            //Aggregates

            countColumnSums = Enumerable.Range(0, classCount).Select(i => (double)confusionMatrixCounts.SumColumn(i)).ToArray();
            countRowSums    = Enumerable.Range(0, classCount).Select(i => (double)confusionMatrixCounts.SumRow(i)).ToArray();
            scoreColumnSums = Enumerable.Range(0, classCount).Select(i => confusionMatrixScores.SumColumn(i)).ToArray();
            scoreRowSums    = Enumerable.Range(0, classCount).Select(i => confusionMatrixScores.SumRow(i)).ToArray();

            classCountAccuracies      = Enumerable.Range(0, classCount).Select(c => confusionMatrixCounts [c, c] / (double)countRowSums [c]).ToArray();
            overallAccuracy           = Enumerable.Range(0, classCount).Select(i => (double)confusionMatrixCounts [i, i]).Sum() / classificationInstances.Count;
            expectedAccuracyRandom    = (1.0 / classCount);
            topClassSelectionAccuracy = labeledData.GroupBy(item => item.labels[criterionByWhichToClassify]).Select(grp => grp.Count()).Max() / (double)labeledData.data.Count;

            //Safety check.
            {
                double countSum = countColumnSums.Sum();
                double scoreSum = scoreColumnSums.Sum();

                //These should all equal the number of classified instances, to within numerical error.
                Trace.Assert(Math.Abs(countSum - classificationInstances.Count) < .00001);
                Trace.Assert(Math.Abs(scoreSum - classificationInstances.Count) < .00001);
            }

            //Class, Confidence Bucket
            accuracyByTrueClassAndConfidence      = new double[classCount + 1, bucketCount];
            accuracyByPredictedClassAndConfidence = new double[classCount + 1, bucketCount];

            for (int i = 0; i < bucketCount; i++)
            {
                for (int j = 0; j < classCount; j++)
                {
                    accuracyByTrueClassAndConfidence [j + 1, i]      = (double)countsConfusionMatricesByConfidence [i] [j, j] / (double)countsConfusionMatricesByConfidence [i].SumRow(j);
                    accuracyByPredictedClassAndConfidence [j + 1, i] = (double)countsConfusionMatricesByConfidence [i] [j, j] / (double)countsConfusionMatricesByConfidence [i].SumColumn(j);
                }

                //TODO: Never use a try catch block, and punish those who do.
                try {
                    accuracyByTrueClassAndConfidence [0, i] = Enumerable.Range(1, classCount).Select(j => accuracyByTrueClassAndConfidence [j, i]).Where(val => !Double.IsNaN(val)).Average();
                } catch {
                    accuracyByTrueClassAndConfidence [0, i] = Double.NaN;
                }
                try {
                    accuracyByPredictedClassAndConfidence [0, i] = Enumerable.Range(1, classCount).Select(j => accuracyByPredictedClassAndConfidence [j, i]).Where(val => !Double.IsNaN(val)).Average();
                } catch {
                    accuracyByPredictedClassAndConfidence [0, i] = Double.NaN;
                }
            }

            //For use in math mode elements and matrices.
            datasetSchemaText        = datasetSchema.Select(item => @"\text{" + LatexExtensions.limitLength(item, 15) + "}").ToArray();
            datasetSchemaTextRotated = datasetSchemaText.Select(item => @"\begin{turn}{70} " + item + @" \end{turn}").ToArray();

            //TODO: Limiting length could cause duplication


            return this;
        }
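
The aggregation step above reduces the raw classification instances to confusion matrices and per-class accuracies. A self-contained sketch of that arithmetic, using only a plain int[,] matrix (names here are illustrative and not part of the project's API):

        public static void PrintConfusionMatrixAccuracies(int[, ] counts)
        {
            //counts[a, b] counts how often true class a was classified as b.
            int classCount = counts.GetLength(0);
            int total = 0, diagonal = 0;

            for (int a = 0; a < classCount; a++)
            {
                int rowSum = 0;
                for (int b = 0; b < classCount; b++)
                {
                    rowSum += counts [a, b];
                    total  += counts [a, b];
                }
                diagonal += counts [a, a];

                //Per-class accuracy: diagonal entry over its row sum (NaN if the class never occurs).
                double classAccuracy = rowSum == 0 ? Double.NaN : counts [a, a] / (double)rowSum;
                Console.WriteLine("Class " + a + " accuracy: " + classAccuracy);
            }

            //Overall accuracy: diagonal sum over the total number of instances.
            Console.WriteLine("Overall accuracy: " + diagonal / (double)total);
        }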
Example #3
        public static void TestNewDesign()
        {
            DiscreteSeriesDatabase <string> allData = LoadRegionsDatabase();

            Tuple <DiscreteSeriesDatabase <string>, DiscreteSeriesDatabase <string> > split = allData.SplitDatabase(.8);

            DiscreteSeriesDatabase <string> trainingData = split.Item1;
            DiscreteSeriesDatabase <string> testData     = split.Item2;


            IFeatureSynthesizer <string> synth = new RegressorFeatureSynthesizerKmerFrequenciesVarK <string>("region", 8, 2, 100, 3);

            //IFeatureSynthesizer<string> synth = new RegressorFeatureSynthesizerKmerFrequencies<string>("region", 4, 10, 100, 3);
            //IFeatureSynthesizer<string> synth = new RegressorFeatureSynthesizerFrequencies<string>("region", 4, 10, 100);

            synth.Train(trainingData);

            Console.WriteLine(synth.ToString());
            synth.ScoreModel(testData, 2, "filename");
            Console.WriteLine(ClassifyDataSet(synth, testData, "filename"));              //TODO may be good to use something unspecifiable in the file syntax such as "filename;"


            //Console.WriteLine (allData.DatabaseLatexString("Regional Spanish Database"));
        }
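
All three examples follow the same load / split / train / score pattern. A hedged sketch of that pattern as a small reusable helper, assuming only the Train and ScoreModel members used above (the helper itself is illustrative, not part of the project):

        public static double TrainAndScore(IFeatureSynthesizer <string> synth, double trainFraction)
        {
            //Load the labeled database and split it into training and test portions.
            DiscreteSeriesDatabase <string> allData = LoadRegionsDatabase();
            Tuple <DiscreteSeriesDatabase <string>, DiscreteSeriesDatabase <string> > split = allData.SplitDatabase(trainFraction);

            //Fit on the training portion and score on the held-out portion.
            synth.Train(split.Item1);
            return synth.ScoreModel(split.Item2);
        }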