Example #1
        public void Train(DiscreteSeriesDatabase <Ty> series)
        {
            //TODO: Sharing this data like this may be detrimental.
            //TODO: Boolean for whether the synthesizer needs to be trained.
            synthesizer.Train(series);
            classifier.Train(series                                                                                                                                 //.AsParallel().AsOrdered() //Parallel causes a bug where not all of the items are always reached.
                             .Where(item => item.labels.ContainsKey(synthesizer.ClassificationCriterion))
                             .Select(item => new LabeledInstance(item.labels[synthesizer.ClassificationCriterion], synthesizer.SynthesizeFeatures(item))).ToArray() //But this seems to fix it?
                             );

            //There is a known bug; the code below checks for its failure mode.


            if (classifier is NullProbabalisticClassifier)
            {
                string[] synthFeatures     = synthesizer.GetFeatureSchema();
                string[] classifierClasses = classifier.GetClasses();

                if (synthFeatures.Length != classifierClasses.Length || !synthFeatures.Zip(classifierClasses, (s, c) => (s == c)).Conjunction())
                {
                    Console.WriteLine("A catastrophic error has occured in the Null Probabalistic Classifier.  Feature schema:");
                    Console.WriteLine(synthFeatures.FoldToString());
                    Console.WriteLine("But classifier (NullProbabalisticClassifier):");
                    Console.WriteLine(classifierClasses.FoldToString());
                    Console.WriteLine("Training Names:");
                    string[] trainingNames = series.Where(item => item.labels.ContainsKey(synthesizer.ClassificationCriterion)).Select(item => item.labels[synthesizer.ClassificationCriterion]).Distinct().Order().ToArray();
                    Console.WriteLine(trainingNames.FoldToString());
                    Console.WriteLine("synthesizer, classifier, training: " + synthFeatures.Length + ", " + classifierClasses.Length + ", " + trainingNames.Length);
                    Console.WriteLine("All Training Data:");
                    Console.WriteLine(series.FoldToString(item => item.labels.GetWithDefault(synthesizer.ClassificationCriterion, "[none]")));
                    Console.Write("");
                }
            }
        }
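The commented-out .AsParallel().AsOrdered() above points at a nondeterminism bug, and the trailing ToArray() is what "seems to fix it". Below is a minimal sketch of the same pipeline with parallelism re-enabled and the materialization kept explicit; thread safety of synthesizer.SynthesizeFeatures is an assumption here, not something the original code establishes.

        //Sketch only: parallelize the pure projection, then force evaluation
        //with ToArray() so the query runs exactly once before training.
        //Assumes synthesizer.SynthesizeFeatures is thread-safe.
        LabeledInstance[] instances = series
            .Where(item => item.labels.ContainsKey(synthesizer.ClassificationCriterion))
            .AsParallel().AsOrdered()
            .Select(item => new LabeledInstance(item.labels[synthesizer.ClassificationCriterion],
                                                synthesizer.SynthesizeFeatures(item)))
            .ToArray();
        classifier.Train(instances);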
Example #2
 //Train an IFeatureSynthesizer model.
 //This function must be called before SynthesizeFeatures if and only if NeedsTraining is true.
 public void Train(DiscreteSeriesDatabase <Ty> data)
 {
     foreach (IFeatureSynthesizer <Ty> synth in synths)
     {
         if (synth.NeedsTraining)
         {
             synth.Train(data);
         }
     }
 }
Example #3
 public ClassifierAccuracyAnalysis(IEventSeriesProbabalisticClassifier <Ty> classifier, string classifierName, DiscreteSeriesDatabase <Ty> labeledData, string criterionByWhichToClassify, double trainSplitFrac, int iterations, double bucketSize)
 {
     this.classifier                 = classifier;
     this.classifierName             = classifierName;
     this.labeledData                = labeledData;
     this.criterionByWhichToClassify = criterionByWhichToClassify;
     this.trainSplitFrac             = trainSplitFrac;
     this.iterations                 = iterations;
     this.bucketSize                 = bucketSize;
 }
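A hedged usage sketch for this constructor, chained into runAccuracyAnalysis from Example #12; the classifier and labeledData variables and all argument values are illustrative assumptions, not values taken from the original code.

     //Illustrative values only; bucketSize 0.1 yields bucketCount = 10 in Example #12.
     ClassifierAccuracyAnalysis <string> analysis = new ClassifierAccuracyAnalysis <string>(
         classifier, "kmer classifier", labeledData,
         "author", //criterionByWhichToClassify
         0.8,      //trainSplitFrac
         4,        //iterations
         0.1       //bucketSize
         ).runAccuracyAnalysis();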
Example #4
        public static void runNewsClassifierDerivation(string inFile, string outDirectory, int count, int iterations)
        {
            //Load the database:
            DiscreteSeriesDatabase <string> data = getNewsDataset(inFile, count);
            //data = data.SplitDatabase (.1).Item1;


            IEnumerable <Tuple <string, IEventSeriesProbabalisticClassifier <string> > > classifiers = TextClassifierFactory.NewsTestClassifiers().Concat(TextClassifierFactory.NewsTestAdvancedClassifiers().Skip(1));
            IFeatureSynthesizer <string> synth = new CompoundFeatureSynthesizer <string>("author", new IFeatureSynthesizer <string>[] {
                new VarKmerFrequencyFeatureSynthesizer <string>("author", 3, 2, 60, 0.7, false),
                new VarKmerFrequencyFeatureSynthesizer <string>("location", 3, 3, 50, 1, false),
                new VarKmerFrequencyFeatureSynthesizer <string>("gender", 3, 8, 50, 10, false),
                new DateValueFeatureSynthesizer("date"),
                new LatinLanguageFeatureSynthesizer("author")
            });

            WriteupGenerator.ProduceClassifierComparisonWriteup <string>("Classifier Comparison Analysis on Ekantipur News Articles", "Cyrus Cousins with Shirish Pokharel", 20, 20, outDirectory, classifiers.ToArray(), "News", data, "author", iterations, new[] { "author", "location", "date", "gender" }, synth);
        }
Example #5
        protected override IEventSeriesScalarRegressor <Ty>[] CreateRegressors(DiscreteSeriesDatabase <Ty> data)
        {
            //Partition into class and classless groups.
            Tuple <IEnumerable <DiscreteEventSeries <Ty> >, IEnumerable <DiscreteEventSeries <Ty> > > partitioned = data.data.Partition(item => item.labels.ContainsKey(ClassificationCriterion));
            IEnumerable <DiscreteEventSeries <Ty> > noClass = partitioned.Item2;          //These items have no class for the category label for which the feature synthesizer is being created.

            IEnumerable <DiscreteEventSeries <Ty> > inClass = partitioned.Item1;

            IEnumerable <IGrouping <string, DiscreteEventSeries <Ty> > > groupings = inClass.GroupBy(item => item.labels[ClassificationCriterion]);

            //Establish multisets for each class

            IEnumerable <Tuple <string, Multiset <Ty> > > classSets = groupings.Map(grp => Tuple.Create(grp.Key, grp.ToMultiset())).ToArray();           //Used twice.  Make it an array.

            //Establish the baseline (all data)
            Multiset <Ty> baseline = noClass.ToMultiset().Cons(classSets.Select(a => a.Item2)).MultisetUnion();

            return(classSets.Map(ntp => new ItemFrequencyRegressor <Ty>(ntp.Item1, minSignificantCount, smoothingAmount, featuresToUse, baseline, ntp.Item2)).ToArray());
        }
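Partition is not a standard LINQ operator; from this call site and Examples #6 and #15 it evidently splits a sequence into (matching, non-matching) halves, with Item1 holding the elements that satisfy the predicate. A sketch of such an extension, reconstructed from usage rather than taken from the library source:

        //Sketch inferred from call sites: Item1 = matches, Item2 = non-matches.
        public static Tuple <IEnumerable <T>, IEnumerable <T> > Partition <T>(this IEnumerable <T> source, Func <T, bool> predicate)
        {
            List <T> materialized = source.ToList(); //Evaluate once so both halves agree.
            return Tuple.Create(
                materialized.Where(predicate).AsEnumerable(),
                materialized.Where(item => !predicate(item)).AsEnumerable());
        }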
Example #6
        //Train an IFeatureSynthesizer model.
        public void Train(DiscreteSeriesDatabase <Ty> trainingData)
        {
            //Partition into class and classless groups.
            Tuple <IEnumerable <DiscreteEventSeries <Ty> >, IEnumerable <DiscreteEventSeries <Ty> > > partitioned = trainingData.data.Partition(item => item.labels.ContainsKey(ClassificationCriterion));
            IEnumerable <DiscreteEventSeries <Ty> > classedSeries   = partitioned.Item1;
            IEnumerable <DiscreteEventSeries <Ty> > classlessSeries = partitioned.Item2;          //These items have no class for the category label for which the feature synthesizer is being created.

            TupleStruct <string, MultisetKmer <Ty> >[] classes = classedSeries.AsParallel()
                                                                 .GroupBy(series => series.labels[ClassificationCriterion])                                                                                       //Group by class
                                                                 .Select(grp => new TupleStruct <string, MultisetKmer <Ty> >(grp.Key, ((IEnumerable <DiscreteEventSeries <Ty> >)grp).ToMultisetVarKmer <Ty> (k))) //Make classes into a single multiset each.
                                                                 .OrderBy(tup => tup.Item1)                                                                                                                       //Sort by name
                                                                 .ToArray();

            /*
             * Console.WriteLine("GROUPS");
             * foreach(var v in classes){
             *      Console.WriteLine ("Class " + v.Item1 + " size " + v.Item2.Size ());
             * }
             */


            MultisetKmer <Ty> baseline;

            if (useUncategorizedForBaseline)
            {
                baseline = classlessSeries.ToMultisetVarKmer(k).Cons(classes.Select(@class => @class.Item2)).MultisetKmerUnion();                    //TODO reuse the classless multiset.
            }
            else
            {
                baseline = classedSeries.ToMultisetVarKmer(k);
            }

            //We now have data for all classes and the baseline.

            IEnumerable <TupleStruct <Kmer <Ty>, double> > characteristicKmers = Enumerable.Range(0, classes.Length).AsParallel().SelectMany(index => ExtractCharacteristicKmersForClass(index, classes[index].Item2, baseline));


            //Lookup for all kmers.

            kmersOntoIndex = characteristicKmers.OrderByDescending(item => item.Item2).Select(item => item.Item1).Distinct().Take((int)kmersToTake).IndexLookupDictionary();
            kmerCount      = kmersOntoIndex.Count;
        }
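ExtractCharacteristicKmersForClass is not shown in this example, but the serial loop commented out in Example #15 spells out the scoring rule it implements. The following sketch is reconstructed from that comment; minKmerCount, smoothingAmt, and kmersToTake are fields named there, and everything else is an assumption.

        //Sketch of the per-class rule from Example #15's commented-out loop: keep
        //kmers whose in-class frequency beats the Laplace-smoothed baseline
        //frequency, weighted by the difference.
        IEnumerable <TupleStruct <Kmer <Ty>, double> > ExtractCharacteristicKmersSketch(MultisetKmer <Ty> classSet, MultisetKmer <Ty> baseline)
        {
            List <TupleStruct <Kmer <Ty>, double> > found = new List <TupleStruct <Kmer <Ty>, double> >();
            foreach (KeyValuePair <Kmer <Ty>, int> kvp in classSet)
            {
                if (kvp.Value > minKmerCount)
                {
                    double thisFreq = kvp.Value / (double)classSet.Size(kvp.Key.data.Count);
                    double baseFreq = baseline.GetKeyFracLaplace(kvp.Key, smoothingAmt);
                    if (thisFreq > baseFreq) //Overrepresented in this class.
                    {
                        found.Add(new TupleStruct <Kmer <Ty>, double>(kvp.Key, thisFreq - baseFreq));
                    }
                }
            }
            return found.OrderByDescending(tup => tup.Item2).Take((int)kmersToTake); //Keep the strongest kmersToTake kmers.
        }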
Example #7
        public static void runNewsClassification(string inFile, string outDirectory, int count, int iterations)
        {
            DiscreteSeriesDatabase <string> data = getNewsDataset(inFile, count);

            //Create the classifier

            /*
             * IEventSeriesProbabalisticClassifier<string> classifier = new SeriesFeatureSynthesizerToVectorProbabalisticClassifierEventSeriesProbabalisticClassifier<string>(
             *      new VarKmerFrequencyFeatureSynthesizer<string>("author", 3, 2, 60, 0.1, false),
             *      new NullProbabalisticClassifier()
             * );
             */

            IEventSeriesProbabalisticClassifier <string> classifier = new SeriesFeatureSynthesizerToVectorProbabalisticClassifierEventSeriesProbabalisticClassifier <string>(
                new VarKmerFrequencyFeatureSynthesizer <string>("author", 3, 2, 50, 0.6, false),
                new PerceptronCloud(16.0, PerceptronTrainingMode.TRAIN_ALL_DATA, PerceptronClassificationMode.USE_NEGATIVES | PerceptronClassificationMode.USE_SCORES, 1.5, false)
                );

            //string documentTitle, string author, int width, int height, string outFile, IEventSeriesProbabalisticClassifier<Ty> classifier, DiscreteEventSeries<Ty> dataset, string datasetTitle, string criterionByWhichToClassify
            WriteupGenerator.ProduceClassificationReport <string>("Analysis and Classification of " + data.data.Count + " Ekantipur Articles", "Cyrus Cousins with Shirish Pokharel", 20, 20, outDirectory, classifier, "characteristic kmer classifier", data, "News", "author", iterations);
        }
Example #8
        public static double ScoreModel <Ty> (this IFeatureSynthesizer <Ty> synth, DiscreteSeriesDatabase <Ty> testData, int verbosity, string nameCategory = null)
        {
            Dictionary <string, int> classRanks = synth.GetFeatureSchema().Select((item, index) => new Tuple <string, int> (item, index)).ToDictionary(a => a.Item1, a => a.Item2);

            //Display schema
            if (verbosity >= 2)
            {
                Console.WriteLine(synth.GetFeatureSchema().FoldToString());
            }

            double score = testData.data.AsParallel()
                           .Where(item => classRanks.ContainsKey(item.labels.GetWithDefault(synth.ClassificationCriterion, ""))) //Filter for items for which we have regressors.
                           .Select(i => ScoreModelSingle(synth, classRanks, i, verbosity, nameCategory)).Average();              //Score them and take the average.

            if (verbosity >= 2)
            {
                Console.WriteLine("Total Score = " + score);
                Console.WriteLine("E[random model score] = " + (1.0 / classRanks.Count));
            }

            return(score);
        }
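A usage sketch mirroring Example #10; at verbosity 2 the schema and the expected score of a random model are printed as well. Note that Average() throws on an empty sequence, so a test set containing no recognized labels will fault rather than return a score.

        //Usage sketch (see Example #10); "filename" is only used for per-item reporting.
        double score = synth.ScoreModel(testData, 2, "filename");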
Example #9
        public static void LoadTextDatabase(this DiscreteSeriesDatabase <string> db, string directory, TextReader inStream, Func <string, string> textProcessor, int logLevel = 0)         //TODO make this an extension method for DiscreteSeries<string> ?
        {
            List <DiscreteEventSeries <string> > fileData = db.data;

            //Read the file in
            string s = inStream.ReadToEnd();

            string[] entries = s.Split(newLine, StringSplitOptions.RemoveEmptyEntries);

            //TODO: Zip archive version of the following.

            //Read each file
            List <DiscreteEventSeries <string> > items = entries.AsParallel().Select(entry => {
                Dictionary <string, string> entryDict = processEntryLine(entry, logLevel);
                return(processEntryFromFile(directory, entryDict, logLevel, textProcessor));
            }).Where(entry => entry != null).ToList();

            fileData.AddRange(items);

            //Print some information about what has been read.
            if (logLevel >= 1)
            {
                Console.WriteLine("Loaded " + items.Count + " / " + entries.Length + " discrete event series.  " + items.TotalItemCount() + " total words added.");

                if (logLevel >= 3)
                {
                    IEnumerable <string> categoryKeys = items.SelectMany(item => item.labels.Keys).Distinct().Where(item => item != "filename");
                    foreach (string key in categoryKeys)
                    {
                        Console.WriteLine("Classification Criterion: " + key);
                        Console.WriteLine(items.GroupBy(item => item.labels.GetWithDefault(key, "[none]"))                                                                   //Group by category
                                          .FoldToString(item => item.Key + " (" + item.Select(subitem => subitem.data.Length).Sum() + " words): "                            //Count words per category
                                                        + item.FoldToString(subitem => subitem.labels["filename"] + " (" + subitem.data.Length + " words)"), "", "", "\n")); //Show each item in category.
                    }
                }
            }
        }
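The entry line grammar consumed by processEntryLine is not shown here, but Example #23 builds key files by hand, which suggests each line is a semicolon-separated list of label:value pairs followed by a file name. A representative key file under that assumption:

        region:españa;type:news españa
        region:cuba;type:receta recetascuba2
        region:méxico;type:paper poesiamexicana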
Example #10
        public static void TestNewDesign()
        {
            DiscreteSeriesDatabase <string> allData = LoadRegionsDatabase();

            Tuple <DiscreteSeriesDatabase <string>, DiscreteSeriesDatabase <string> > split = allData.SplitDatabase(.8);

            DiscreteSeriesDatabase <string> trainingData = split.Item1;
            DiscreteSeriesDatabase <string> testData     = split.Item2;


            IFeatureSynthesizer <string> synth = new RegressorFeatureSynthesizerKmerFrequenciesVarK <string>("region", 8, 2, 100, 3);

            //IFeatureSynthesizer<string> synth = new RegressorFeatureSynthesizerKmerFrequencies<string>("region", 4, 10, 100, 3);
            //IFeatureSynthesizer<string> synth = new RegressorFeatureSynthesizerFrequencies<string>("region", 4, 10, 100);

            synth.Train(trainingData);

            Console.WriteLine(synth.ToString());
            synth.ScoreModel(testData, 2, "filename");
            Console.WriteLine(ClassifyDataSet(synth, testData, "filename"));              //TODO may be good to use something unspecifiable in the file syntax such as "filename;"


            //Console.WriteLine (allData.DatabaseLatexString("Regional Spanish Database"));
        }
Example #11
 public void Train(DiscreteSeriesDatabase <string> data)
 {
     throw new Exception("Cannot train a TextFeatureSynthesizer.");
 }
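Example #2 guards Train calls behind a NeedsTraining flag, so a hedged alternative to throwing here is to advertise that no training is required. Whether TextFeatureSynthesizer exposes NeedsTraining is an assumption based on Example #2's loop.

 //Sketch: report that no training is needed rather than throwing.
 public bool NeedsTraining { get { return false; } }

 public void Train(DiscreteSeriesDatabase <string> data)
 {
     //No-op: a TextFeatureSynthesizer carries no trainable state.
 }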
Example #12
        public ClassifierAccuracyAnalysis <Ty> runAccuracyAnalysis()
        {
            string nameCriterion = "filename";             //TODO: Input parameter or constant for this.

            //Filter out items not labeled for this criterion.
            labeledData = labeledData.FilterForCriterion(criterionByWhichToClassify);

            //Derive the ordered schema of classes present for this criterion.
            datasetSchema = labeledData.getLabelClasses(criterionByWhichToClassify).Order().ToArray();

            //Create mapping from strings to indices
            schemaMapping = datasetSchema.IndexLookupDictionary();

            bucketCount = (int)(1.0 / bucketSize);
            classCount  = datasetSchema.Length;

            //Raw data classifications:
            //Name, true class, predicted class, scores, winning score;
            classificationInstances = new List <Tuple <string, string, string, double[], double> > ();
            if (testOverfitting)
            {
                trainingDataClassificationInstances = new List <Tuple <string, string, string, double[], double> > ();
            }

            string classifierName = "\"" + AlgorithmReflectionExtensions.GetAlgorithmName(classifier) + "\"";

            Console.WriteLine("Running classifier " + classifierName + " on " + labeledData.data.Count + " items.");

            //TODO: Not a bad idea to duplicate the classifiers to increase parallelism.

            //Run and make classifiers.
            for (int i = 0; i < iterations; i++)
            {
                Console.WriteLine("Classifier Accuracy: Initiating round " + (i + 1) + " / " + iterations + " for " + classifierName + ".");

                Tuple <DiscreteSeriesDatabase <Ty>, DiscreteSeriesDatabase <Ty> > split = labeledData.SplitDatabase(trainSplitFrac);              //TODO: Vary this?
                DiscreteSeriesDatabase <Ty> training = split.Item1;
                DiscreteSeriesDatabase <Ty> test     = split.Item2;

                classifier.Train(training);

                string[] classifierSchema = classifier.GetClasses();

                classificationInstances.AddRange(test.data.AsParallel().WithExecutionMode(ParallelExecutionMode.ForceParallelism).Select(item => classificationInfo(classifier, classifierSchema, schemaMapping, item, nameCriterion, criterionByWhichToClassify)));
                if (testOverfitting)
                {
                    trainingDataClassificationInstances.AddRange(training.data.Take((int)(overfittingTestFrac * training.data.Count)).AsParallel().WithExecutionMode(ParallelExecutionMode.ForceParallelism).Select(item => classificationInfo(classifier, classifierSchema, schemaMapping, item, nameCriterion, criterionByWhichToClassify)));
                }
            }


            //Confusion Matrices.
            confusionMatrixCounts = new int[classCount, classCount];             // [a,b] : How often a is classified as b
            confusionMatrixScores = new double[classCount, classCount];

            //Confusion matrix allocation
            countsConfusionMatricesByConfidence = new int[bucketCount][, ];
            scoresConfusionMatricesByConfidence = new double[bucketCount][, ];

            for (int i = 0; i < bucketCount; i++)
            {
                countsConfusionMatricesByConfidence [i] = new int[classCount, classCount];
                scoresConfusionMatricesByConfidence [i] = new double[classCount, classCount];
            }

            //Confusion Matrix population
            foreach (var classification in classificationInstances)
            {
                int confidenceBucket = Math.Min((int)Math.Floor(classification.Item5 * bucketCount), bucketCount - 1);                   //On a score of 1 or greater, clip to the top bucket.  Highest confidence is always positive because confidences sum to 1.
                //Counts
                confusionMatrixCounts [schemaMapping [classification.Item2], schemaMapping [classification.Item3]]++;
                countsConfusionMatricesByConfidence [confidenceBucket] [schemaMapping [classification.Item2], schemaMapping [classification.Item3]]++;

                //Scores
                for (int j = 0; j < classCount; j++)
                {
                    confusionMatrixScores [schemaMapping [classification.Item2], j] += classification.Item4 [j];
                    scoresConfusionMatricesByConfidence [confidenceBucket] [schemaMapping [classification.Item2], j] += classification.Item4 [j];
                }
            }


            //Aggregates

            countColumnSums = Enumerable.Range(0, classCount).Select(i => (double)confusionMatrixCounts.SumColumn(i)).ToArray();
            countRowSums    = Enumerable.Range(0, classCount).Select(i => (double)confusionMatrixCounts.SumRow(i)).ToArray();
            scoreColumnSums = Enumerable.Range(0, classCount).Select(i => confusionMatrixScores.SumColumn(i)).ToArray();
            scoreRowSums    = Enumerable.Range(0, classCount).Select(i => confusionMatrixScores.SumRow(i)).ToArray();

            classCountAccuracies      = Enumerable.Range(0, classCount).Select(c => confusionMatrixCounts [c, c] / (double)countRowSums [c]).ToArray();
            overallAccuracy           = Enumerable.Range(0, classCount).Select(i => (double)confusionMatrixCounts [i, i]).Sum() / classificationInstances.Count;
            expectedAccuracyRandom    = (1.0 / classCount);
            topClassSelectionAccuracy = labeledData.GroupBy(item => item.labels[criterionByWhichToClassify]).Select(grp => grp.Count()).Max() / (double)labeledData.data.Count;

            //Safety check.
            {
                double countSum = countColumnSums.Sum();
                double scoreSum = scoreColumnSums.Sum();

                //These should all equal the number of classified instances, to within numerical error.
                Trace.Assert(Math.Abs(countSum - classificationInstances.Count) < .00001);
                Trace.Assert(Math.Abs(scoreSum - classificationInstances.Count) < .00001);
            }

            //Class, Confidence Bucket
            accuracyByTrueClassAndConfidence      = new double[classCount + 1, bucketCount];
            accuracyByPredictedClassAndConfidence = new double[classCount + 1, bucketCount];

            for (int i = 0; i < bucketCount; i++)
            {
                for (int j = 0; j < classCount; j++)
                {
                    accuracyByTrueClassAndConfidence [j + 1, i]      = (double)countsConfusionMatricesByConfidence [i] [j, j] / (double)countsConfusionMatricesByConfidence [i].SumRow(j);
                    accuracyByPredictedClassAndConfidence [j + 1, i] = (double)countsConfusionMatricesByConfidence [i] [j, j] / (double)countsConfusionMatricesByConfidence [i].SumColumn(j);
                }

                //TODO: Never use a try catch block, and punish those who do.
                try {
                    accuracyByTrueClassAndConfidence [0, i] = Enumerable.Range(1, classCount).Select(j => accuracyByTrueClassAndConfidence [j, i]).Where(val => !Double.IsNaN(val)).Average();
                } catch {
                    accuracyByTrueClassAndConfidence [0, i] = Double.NaN;
                }
                try {
                    accuracyByPredictedClassAndConfidence [0, i] = Enumerable.Range(1, classCount).Select(j => accuracyByPredictedClassAndConfidence [j, i]).Where(val => !Double.IsNaN(val)).Average();
                } catch {
                    accuracyByPredictedClassAndConfidence [0, i] = Double.NaN;
                }
            }

            //For use in math mode elements and matrices.
            datasetSchemaText        = datasetSchema.Select(item => @"\text{" + LatexExtensions.limitLength(item, 15) + "}").ToArray();
            datasetSchemaTextRotated = datasetSchemaText.Select(item => @"\begin{turn}{70} " + item + @" \end{turn}").ToArray();

            //TODO: Limiting length could cause duplication


            return(this);
        }
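A worked instance of the confidence-bucket rule used above, assuming bucketSize = 0.1 and hence bucketCount = 10: a winning score of 0.37 lands in bucket 3, and a score of exactly 1.0 is clipped into the top bucket, 9.

        //Worked example of the bucketing formula (bucketSize = 0.1 assumed):
        int bucketCount = 10;
        int b1 = Math.Min((int)Math.Floor(0.37 * bucketCount), bucketCount - 1); //3
        int b2 = Math.Min((int)Math.Floor(1.00 * bucketCount), bucketCount - 1); //Clipped to 9.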
Example #13
        public static double ScoreModelType <Ty>(IEnumerable <string> categoryLabels, Func <string, IFeatureSynthesizer <Ty> > modelGenerator, DiscreteSeriesDatabase <Ty> trainingData, DiscreteSeriesDatabase <Ty> testData)
        {
            double sumScore = 0;
            int    count    = 0;

            foreach (string categoryLabel in categoryLabels)
            {
                //Train a model for this category label.
                IFeatureSynthesizer <Ty> model = modelGenerator(categoryLabel);
                model.Train(trainingData);
                sumScore += model.ScoreModel(testData);
                count++; //Count each scored model so the average below is well defined.
            }

            return(sumScore / count);
        }
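A hedged call sketch for this scorer; the VarKmerFrequencyFeatureSynthesizer parameters are copied from Example #20, while the label set and the trainingData/testData variables are assumptions.

        //Illustrative invocation; synthesizer parameters mirror Example #20.
        double avg = ScoreModelType(
            new[] { "region", "type" },
            label => new VarKmerFrequencyFeatureSynthesizer <string>(label, 3, 4, 50, 2.0, false),
            trainingData, testData);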
Example #14
 //Score a model.  Value returned on [0, 1], where 1 represents a perfectly accurate model and 0 a completely inaccurate model.
 public static double ScoreModel <Ty> (this IFeatureSynthesizer <Ty> synth, DiscreteSeriesDatabase <Ty> testData)
 {
     return(synth.ScoreModel(testData, 1));
 }
Example #15
        //Train an IFeatureSynthesizer model.
        public void Train(DiscreteSeriesDatabase <Ty> trainingData)
        {
            //Partition into class and classless groups.
            Tuple <IEnumerable <DiscreteEventSeries <Ty> >, IEnumerable <DiscreteEventSeries <Ty> > > partitioned = trainingData.data.Partition(item => item.labels.ContainsKey(ClassificationCriterion));
            IEnumerable <DiscreteEventSeries <Ty> > classedSeries   = partitioned.Item1;
            IEnumerable <DiscreteEventSeries <Ty> > classlessSeries = partitioned.Item2;          //These items have no class for the category label for which the feature synthesizer is being created.

            TupleStruct <string, MultisetKmer <Ty> >[] classes = classedSeries.AsParallel()
                                                                 .GroupBy(series => series.labels[ClassificationCriterion])                                                                                       //Group by class
                                                                 .Select(grp => new TupleStruct <string, MultisetKmer <Ty> >(grp.Key, ((IEnumerable <DiscreteEventSeries <Ty> >)grp).ToMultisetVarKmer <Ty> (k))) //Make classes into a single multiset each.
                                                                 .OrderBy(tup => tup.Item1)                                                                                                                       //Sort by name
                                                                 .ToArray();

            MultisetKmer <Ty> baseline;

            if (useUncategorizedForBaseline)
            {
                baseline = classlessSeries.ToMultisetVarKmer(k).Cons(classes.Select(@class => @class.Item2)).MultisetKmerUnion();                    //TODO reuse the classless multiset.
            }
            else
            {
                baseline = classedSeries.ToMultisetVarKmer(k);
            }

            //We now have data for all classes and the baseline.

            //Create the data structures.

            classCount = classes.Length;

            //Lookup for all class strings.
            classLookup = classes.Select(tup => tup.Item1).IndexLookupDictionary();

            //Console.WriteLine ("Training.  " + classes.Length + " classes, " + trainingData.data.Count + " instances.");

            /* Not parallelized:
             * foreach(int classIndex in Enumerable.Range (0, classCount)){
             *      //All the kmers found in this class.
             *      TupleStruct<string, MultisetKmer<Ty>> thisClass = classes[classIndex];
             *      List<TupleStruct<Kmer<Ty>, double>> thisClassCharacteristicKmersStore = new List<TupleStruct<Kmer<Ty>, double>>();
             *      foreach(KeyValuePair<Kmer<Ty>, int> kvp in thisClass.Item2){
             *              if(kvp.Value > minKmerCount){
             *                      double thisFreq = kvp.Value / (double) thisClass.Item2.Size (kvp.Key.data.Count);
             *                      double baseFreq = baseline.GetKeyFracLaplace(kvp.Key, smoothingAmt);
             *
             *                      //Console.WriteLine ("Class: " +  classIndex + " Kmer: " + kvp.Value + ", class freq " + thisFreq + ", base freq " + baseFreq);
             *
             *                      //TODO: Advanced logic.
             *                      if(thisFreq > baseFreq){
             *                              double kmerValue = thisFreq - baseFreq;
             *                              //Console.WriteLine ("Adding kmer " + kvp.Key + " weight " + kmerValue + " for class " + classIndex);
             *                              thisClassCharacteristicKmersStore.Add (new TupleStruct<Kmer<Ty>, double>(kvp.Key, kmerValue));
             *                      }
             *              }
             *      }
             *      foreach(TupleStruct<Kmer<Ty>, double> kmerToAdd in thisClassCharacteristicKmersStore.OrderBy (tup => Math.Abs (tup.Item2)).Take ((int)kmersToTake)){ //TODO: Unordered kth order statistic.
             *              learnedCharacteristicKmers.GetWithDefaultAndAdd(kmerToAdd.Item1, () => new Dictionary<int, double>(classCount))[classIndex] = kmerToAdd.Item2;
             *      }
             * }
             */

            //Parallelized (find characteristic kmers for each class in parallel)

            IEnumerable <TupleStruct <int, IEnumerable <TupleStruct <Kmer <Ty>, double> > > > characteristicKmers = Enumerable.Range(0, classCount).AsParallel().Select(index => new TupleStruct <int, IEnumerable <TupleStruct <Kmer <Ty>, double> > > (index, ExtractCharacteristicKmersForClass(index, classes[index].Item2, baseline)));

            //Discard empty features.
            if (discardEmptyFeatures)
            {
                characteristicKmers = characteristicKmers.ToArray();
                bool[] classFound = new bool[classCount];
                int    foundCount = 0;
                foreach (var v in characteristicKmers)
                {
                    if (!classFound[v.Item1])
                    {
                        classFound[v.Item1] = true;
                        foundCount++;
                        if (foundCount == classCount)
                        {
                            break;
                        }
                    }
                }
                if (foundCount < classCount)
                {
                    string[] newClasses = classes.Where((@class, index) => classFound[index]).Select(@class => @class.Item1).ToArray();
                    Dictionary <string, int> newClassLookup = newClasses.IndexLookupDictionary();

                    int[] oldToNewMapping = new int[classes.Length];
                    foreach (string s in classes.Select(@class => @class.Item1))
                    {
                        oldToNewMapping[classLookup[s]] = newClassLookup.GetWithDefault(s, 0);
                    }

                    characteristicKmers = characteristicKmers.Select(kmer =>
                                                                     new TupleStruct <int, IEnumerable <TupleStruct <Kmer <Ty>, double> > >
                                                                         (oldToNewMapping[kmer.Item1], kmer.Item2));
                    //classes = newClasses; //TODO: May need this for negative kmers.
                    classLookup = newClassLookup;
                    classCount  = foundCount;

                    features = @newClasses.Order().ToArray();
                }
                else
                {
                    features = @classes.Select(@class => @class.Item1).Order().ToArray();
                }
            }
            else
            {
                features = @classes.Select(@class => @class.Item1).Order().ToArray();
            }

            //This part probably can't be parallelized (adding to same dictionary), but should be light
            learnedCharacteristicKmers = new Dictionary <Kmer <Ty>, Dictionary <int, double> >();

            foreach (TupleStruct <int, IEnumerable <TupleStruct <Kmer <Ty>, double> > > thisClass in characteristicKmers)       //TODO: Unordered kth order statistic.
            {
                //Console.WriteLine ("Class " + thisClass.Item1 + " contains " + thisClass.Item2.Count() + " above average kmers.");
                int thisClassIndex = thisClass.Item1;
                IEnumerable <TupleStruct <Kmer <Ty>, double> > thisClassCharacteristicKmersStore = thisClass.Item2;
                foreach (TupleStruct <Kmer <Ty>, double> kmerToAdd in thisClassCharacteristicKmersStore)
                {
                    learnedCharacteristicKmers.GetWithDefaultAndAdd(
                        kmerToAdd.Item1,
                        () => new Dictionary <int, double>(classCount / 2)                         //TODO: The initial capacity of classCount / 2 is a crude guess.
                        )[thisClassIndex] = kmerToAdd.Item2;
                }
            }

            //TODO: Negative Kmers (note, may complicate sizing.  Will not work without a lot of data.)


            //TODO: Put this back in.
            //features.MapInPlace(fname => ClassificationCriterion + ":" + fname); //Add classification criterion to feature names.
        }
Example #16
 //CLASSIFICATION:
 public static string ClassifyDataSet <Ty>(IFeatureSynthesizer <Ty> synth, DiscreteSeriesDatabase <Ty> db, string nameField)
 {
     return(db.data.AsParallel().Select(item => ClassifyItem(synth, item, nameField)).FoldToString());
 }
Example #17
 protected abstract IEventSeriesScalarRegressor <Ty>[] CreateRegressors(DiscreteSeriesDatabase <Ty> data);
Example #18
 public void Train(DiscreteSeriesDatabase <Ty> data)
 {
     synth.Train(data);
 }
Example #19
        public static DiscreteSeriesDatabase <string> getNewsDataset(string fileName, int count = 0)
        {
            DiscreteSeriesDatabase <string> data = new DiscreteSeriesDatabase <string> ();

            using (StreamReader keyfile = File.OpenText(fileName + "key")) {
                if (count > 0)
                {
                    keyfile.BaseStream.Seek(-107 * count, System.IO.SeekOrigin.End);                      //avg line is ~81 characters; 107 per line leaves headroom.
                    keyfile.ReadLine();
                }
//				for(int i = 0; i < 8000; i++) keyfile.ReadLine ();
                data.LoadTextDatabase(fileName + "/", keyfile, DatabaseLoader.ProcessEnglishText, 1);
            }

            //Do some processing on the database
            foreach (DiscreteEventSeries <string> item in data.data)
            {
                string author = AsciiOnly(item.labels ["author"], false).RegexReplace(@"_+", @" ").RegexReplace(@"(?:[<])|(?:^[ ,])|(?:$)|(?:\')|(?:\\)", "").RegexReplace(@"([#$&])", @"\$1");
                author = manualRenames.GetWithDefault(author, author);

                if (author.StartsWith(@" "))                    //TODO: Why is this not caught by the regex?
                {
                    author = author.Substring(1);
                }
                if (invalidAuthors.Contains(author))
                {
                    //Console.WriteLine ("REMOVED " + author);
                    item.labels.Remove("author");
                }
                else
                {
                    item.labels ["author"] = NameCase(author);                     //Put the formatting done above back into db

                    string[] authSplit = author.Split(' ');
                    string   firstName = authSplit[0].ToLower();
                    if (titles.Contains(firstName) && authSplit.Length > 1)
                    {
                        if (authSplit.Length == 2)
                        {
                            //Just a last name.
                            firstName = "a";                             //Will be marked neutral.
                        }
                        else
                        {
                            firstName = authSplit[1];
                        }
                    }

                    if (neutralNames.Contains(firstName) || firstName.Length == 1)
                    {
                        //Gender unknown
                    }
                    else if (maleNames.Contains(firstName) || firstName.EndsWith("ndra"))
                    {
                        item.labels["gender"] = "male";
                    }
                    else if (firstName[firstName.Length - 1] == 'a' || firstName.EndsWith("ee") || femaleNames.Contains(firstName))
                    {
                        item.labels["gender"] = "female";
                    }
                    else if ("eiou".Contains(firstName[firstName.Length - 1]))
                    {
                        //Gender unknown (suspected female)
                    }
                    else if (firstName.Length > 1)
                    {
                        item.labels["gender"] = "male";
                    }
                }

                item.labels ["filename"] = item.labels ["filename"].Replace("_", " ").RegexReplace("([#$&])", "\\$1");
                if (item.labels.ContainsKey("location"))
                {
                    item.labels ["location"] = item.labels ["location"].Replace("_", " ").RegexReplace("([#$&])", "\\$1");
                    item.labels ["location"] = manualLocationRenames.GetWithDefault(item.labels["location"], item.labels["location"]);
                    item.labels ["location"] = NameCase(item.labels ["location"]);
                }
            }

            return(data);
        }
Example #20
        /*
         * public void TestNewClassifiers(){
         *
         * }
         */

        public static void TestLatex()
        {
            bool test      = true;
            bool shorten   = true;
            bool costarica = true;
            bool cuba      = true;

            if (test)
            {
                costarica = cuba = false;
            }

            DiscreteSeriesDatabase <string> allData = LoadRegionsDatabase(test, shorten, costarica, cuba);


            /*
             * IFeatureSynthesizer<string> testSynth = new VarKmerFrequencyFeatureSynthesizer<string>("region", 3, 4, 50, 2.0, true);
             * testSynth.Train (allData);
             *
             * Console.WriteLine (testSynth.GetFeatureSchema().FoldToString ());
             * Console.WriteLine (testSynth.SynthesizeFeaturesSumToOne(new DiscreteEventSeries<string>(allData.data.First ().labels, allData.data.First ().Take (25).ToArray ())).FoldToString (d => d.ToString ("F3")));
             * Console.ReadLine ();
             */

            /*
             * if(test){
             *      allData = allData.SplitDatabase (.25).Item1;
             * }
             */


            //TODO: Add length distribution for documents and each type.

            //Create a feature synthesizer

            //IFeatureSynthesizer<string> synth = new RegressorFeatureSynthesizerKmerFrequenciesVarK<string>("region", 8, 2, 100, 3); //Slow old way
            //IFeatureSynthesizer<string> synth = new VarKmerFrequencyFeatureSynthesizer<string>("region", 3, 4, 50, 2.0, true);

            //IEventSeriesProbabalisticClassifier<string> textClassifier // = TextClassifierFactory.TextClassifier ("region", new[]{"region", "type"});

            //string documentTitle, string author, int width, int height, string outFile, IEnumerable<Tuple<string, IEventSeriesProbabalisticClassifier<Ty>>> classifiers, string datasetTitle, DiscreteSeriesDatabase<Ty> dataset, string criterionByWhichToClassify
            //IEnumerable<Tuple<string, IEventSeriesProbabalisticClassifier<string>>> classifiers = TextClassifierFactory.RegionsTestClassifiers().ToArray ();
            IEnumerable <Tuple <string, IEventSeriesProbabalisticClassifier <string> > > classifiers = TextClassifierFactory.RegionsPerceptronTestClassifiers().ToArray();

            IFeatureSynthesizer <string> synthesizer = new CompoundFeatureSynthesizer <string>(
                "region",
                new IFeatureSynthesizer <string>[] {
                new VarKmerFrequencyFeatureSynthesizerToRawFrequencies <string>("region", 2, 2, 16, .1, false),
                new LatinLanguageFeatureSynthesizer("region"),
                new VarKmerFrequencyFeatureSynthesizer <string>("region", 3, 4, 50, 2.0, false),
                new VarKmerFrequencyFeatureSynthesizer <string>("type", 3, 3, 50, 2.0, false)
            }
                );


            if (test)
            {
                classifiers = classifiers.Take(2);
            }



            WriteupGenerator.ProduceClassifierComparisonWriteup <string>("Spanish Language Dialect Analysis", "Cyrus Cousins", 11, 16, "../../out/spanish/classification/", classifiers.ToArray(), "Spanish Language", allData, "region", test ? 1 : 4, analysisCriteria: new[] { "region", "type" }, synthesizer: synthesizer);

            /*
             * if (classifier is SeriesFeatureSynthesizerToVectorProbabalisticClassifierEventSeriesProbabalisticClassifier<string>) {
             *      IFeatureSynthesizer<string> synthesizer = ((SeriesFeatureSynthesizerToVectorProbabalisticClassifierEventSeriesProbabalisticClassifier<string>)classifier).synthesizer;
             *
             *      //doc.Append ("\\section{Feature Synthesizer Analysis}\n\n");
             *      //doc.Append (synthesizer.FeatureSynthesizerLatexString(allData));
             * }
             */
        }
Example #21
        //Load a database file from a stream.

        public static void LoadTextDatabase(this DiscreteSeriesDatabase <string> db, TextReader inStream, Func <string, string> textProcessor)
        {
            LoadTextDatabase(db, "", inStream, textProcessor);
        }
Example #22
        public static IFeatureSynthesizer <string> deriveOptimalClassifier()
        {
            //Load databases
            DiscreteSeriesDatabase <string> allData = LoadRegionsDatabase();

            Tuple <DiscreteSeriesDatabase <string>, DiscreteSeriesDatabase <string> > split = allData.SplitDatabase(.8);

            DiscreteSeriesDatabase <string> trainingData = split.Item1;
            DiscreteSeriesDatabase <string> testData     = split.Item2;

            string cat = "region";

            double optimalScore = 0;
            IFeatureSynthesizer <string> optimalClassifier = null;
            string optimalInfoStr = null;

            //Preliminary scan

            int[] ks = new int[] { 2, 3, 4 };
            //int[] minCutoffs = new int[]{5, 10, 20};
            int[] minCutoffs       = new int[] { 10 };
            int[] kmerCounts       = new int[] { 10, 25, 50, 100 };
            int[] smoothingAmounts = new int[] { 1, 5, 10 };

            string[] colNames = "k minCutoff kmerCount smoothingAmount score".Split(' ');

            Console.WriteLine(colNames.FoldToString("", "", ","));

            foreach (int k in ks)
            {
                foreach (int minCutoff in minCutoffs)
                {
                    foreach (int kmerCount in kmerCounts)
                    {
                        foreach (int smoothingAmount in smoothingAmounts)
                        {
                            IFeatureSynthesizer <string> classifier = new RegressorFeatureSynthesizerKmerFrequenciesVarK <string>(cat, minCutoff, smoothingAmount, kmerCount, k);
                            classifier.Train(trainingData);

                            double score = classifier.ScoreModel(testData);

                            string infoStr = new double[] { k, minCutoff, kmerCount, smoothingAmount, score }.FoldToString("", "", ",");

                            Console.WriteLine(infoStr);
                            if (score > optimalScore)
                            {
                                optimalScore      = score;
                                optimalClassifier = classifier;
                                optimalInfoStr    = infoStr;
                            }
                        }
                    }
                }
            }

            Console.WriteLine("Optimal Classifier:");
            Console.WriteLine(optimalInfoStr);
            Console.WriteLine(optimalClassifier);

            return(optimalClassifier);
        }
Example #23
        public static DiscreteSeriesDatabase <string> LoadRegionsDatabase(bool test = false, bool shorten = false, bool costarica = true, bool cuba = true)
        {
            //Load training data and create classifier.

            string directory = "../../res/regiones/";

            string[] regions = "españa argentina méxico colombia".Split(' ');

            string file = "";

            if (costarica)
            {
                regions = "costarica".Cons(regions).ToArray();
            }
            if (cuba)
            {
                regions = "cuba".Cons(regions).ToArray();
            }

            //string[] prefixes = new[]{"", "literatura", "historia", "lengua"};
            //file += prefixes.Select (prefix => regions.FoldToString ((sum, val) => sum + "region" + ":" + val + ";" + "type" + ":" + "news" + " " + prefix + val, "", "", "\n")).FoldToString ("", "", "\n");

            file += regions.Aggregate("", (sum, val) => sum + "region" + ":" + val + ";" + "type" + ":" + "news" + " " + val + "\n");
            file += regions.Aggregate("", (sum, val) => sum + "region" + ":" + val + ";" + "type" + ":" + "wiki" + " " + "literatura" + val + "\n");
            file += regions.Aggregate("", (sum, val) => sum + "region" + ":" + val + ";" + "type" + ":" + "wiki" + " " + "historia" + val + "\n");
            file += regions.Aggregate("", (sum, val) => sum + "region" + ":" + val + ";" + "type" + ":" + "wiki" + " " + "lengua" + val + "\n");
            file += regions.Aggregate("", (sum, val) => sum + "region" + ":" + val + ";" + "type" + ":" + "receta" + " " + "recetas" + val + "\n");

            if (!test)
            {
                {
                    string[] literatureRegions = "costarica costarica españa españa españa argentina argentina argentina argentina argentina argentina españa españa españa españa méxico méxico méxico méxico méxico méxico méxico colombia colombia colombia colombia colombia".Split(' ');
                    string[] literatureNames   = "leyendascr elisadelmar juanvaleraavuelaplumaespaña juanvaleraloscordobesesespaña marianela historiauniversal lamuerte buenosaires derroterosyviages fundaciondelaciudad laargentina mosenmillan historiadejudios viajosporespaña recuerdosybellezas leyendasmayas nahuatl laberinto comoaguaparachocolate mitoshorroresmexicanos leyendasmexicanas mitosurbanesmexicanos lamultituderrante viajoscolombianos leyendasurbanascolombianas mitoscolombianos mitoscolombianos2".Split(' ');

                    IEnumerable <string> classesStrings = literatureRegions.Select(r => "region:" + r + ";" + "type:" + "literature");

                    file += classesStrings.Zip(literatureNames, (thisClasses, thisPath) => thisClasses + " " + thisPath).Aggregate(new StringBuilder(), (sum, val) => sum.Append(val).Append("\n"));
                }

                {
                    string[] names = (
                        "salud antologia9 escorpionescr teca vacunoscr lanación universidadcr recetascostarica2 recetascostarica3 crcrawl presidentecostarica gobiernocostarica " +
                        "arqueologiamaya poesiamexicana catolicismosocial unam mxcrawl cocrawl cocrawl2 desplazadoscolombianos mexicocnn méxicolgbt méxicogob historiaazteca historiaazteca2 " +
                        "ordenamientoterretorrial competitividad ministerio"
                        ).Split(' ');
                    string[] tags = (
                        "region:costarica region:costarica region:costarica region:costarica region:costarica;type:paper region:costarica;type:news region:costarica region:costarica;type:receta region:costarica;type:receta region:costarica;type:website region:costarica;type:wiki region:costarica;type:wiki " +
                        "region:méxico region:méxico;type:paper region:méxico;type:paper region:méxico;type:paper region:méxico;type:website region:colombia;type:website region:colombia;type:website region:colombia;type:wiki region:méxico;type:news region:méxico;type:brochure region:méxico;type:website region:méxico region:méxico " +
                        "region:colombia region:colombia region:colombia"
                        ).Split(' ');

                    file += tags.Zip(names, (tag, name) => tag + " " + name).FoldToString("", "\n", "\n");
                }
            }

            if (cuba)
            {
                file += "region:cuba;type:wiki cubaisla\n";
                file += "region:cuba;type:receta recetascuba2\n";
                file += "region:cuba;type:receta recetascuba3\n";
                file += "region:cuba;type:literatura lahistoriame\n";
                file += "region:cuba;type:literatura elencuentro\n";
            }

            Console.WriteLine("Regions Database:");
            Console.WriteLine(file);

            TextReader reader = new StringReader(file);

            DiscreteSeriesDatabase <string> d = new DiscreteSeriesDatabase <string> ();

            d.LoadTextDatabase(directory, reader, DatabaseLoader.ProcessSpanishText, 3);

            if (shorten)
            {
                d = new DiscreteSeriesDatabase <string>(d.Select(item => new DiscreteEventSeries <string>(item.labels, item.data.Take(750).ToArray())));
            }

            return(d);
        }
Example #24
        //Construction:

        //Train an IFeatureSynthesizer model.
        public void Train(DiscreteSeriesDatabase <Ty> data)
        {
            regressors = CreateRegressors(data);
        }