public void Train(DiscreteSeriesDatabase <Ty> series) {
	//TODO: Sharing this data like this may be detrimental.
	//TODO: Boolean for whether the synthesizer needs to be trained.
	synthesizer.Train(series);
	classifier.Train(series
		//.AsParallel().AsOrdered() //Parallel causes a bug where not all of the items are always reached.
		.Where(item => item.labels.ContainsKey(synthesizer.ClassificationCriterion))
		.Select(item => new LabeledInstance(item.labels[synthesizer.ClassificationCriterion], synthesizer.SynthesizeFeatures(item)))
		.ToArray() //But this seems to fix it?
	);

	//There is a bug; this code tests for the failure.
	if (classifier is NullProbabalisticClassifier) {
		string[] synthFeatures = synthesizer.GetFeatureSchema();
		string[] classifierClasses = classifier.GetClasses();
		if (synthFeatures.Length != classifierClasses.Length || !synthFeatures.Zip(classifierClasses, (s, c) => (s == c)).Conjunction()) {
			Console.WriteLine("A catastrophic error has occurred in the Null Probabalistic Classifier. Feature schema:");
			Console.WriteLine(synthFeatures.FoldToString());
			Console.WriteLine("But classifier (NullProbabalisticClassifier):");
			Console.WriteLine(classifierClasses.FoldToString());
			Console.WriteLine("Training Names:");
			string[] trainingNames = series.Where(item => item.labels.ContainsKey(synthesizer.ClassificationCriterion)).Select(item => item.labels[synthesizer.ClassificationCriterion]).Distinct().Order().ToArray();
			Console.WriteLine(trainingNames.FoldToString());
			Console.WriteLine("synthesizer, classifier, training: " + synthFeatures.Length + ", " + classifierClasses.Length + ", " + trainingNames.Length);
			Console.WriteLine("All Training Data:");
			Console.WriteLine(series.FoldToString(item => item.labels.GetWithDefault(synthesizer.ClassificationCriterion, "[none]")));
			Console.Write("");
		}
	}
}
//Train an IFeatureSynthesizer model.
//This function shall be called before SynthesizeFeatures iff NeedsTraining is true.
public void Train(DiscreteSeriesDatabase <Ty> data) {
	foreach (IFeatureSynthesizer <Ty> synth in synths) {
		if (synth.NeedsTraining) {
			synth.Train(data);
		}
	}
}
public ClassifierAccuracyAnalysis(IEventSeriesProbabalisticClassifier <Ty> classifier, string classifierName, DiscreteSeriesDatabase <Ty> labeledData, string criterionByWhichToClassify, double trainSplitFrac, int iterations, double bucketSize) {
	this.classifier = classifier;
	this.classifierName = classifierName;
	this.labeledData = labeledData;
	this.criterionByWhichToClassify = criterionByWhichToClassify;
	this.trainSplitFrac = trainSplitFrac;
	this.iterations = iterations;
	this.bucketSize = bucketSize;
}
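//Illustrative usage sketch (not part of the original source): constructing a ClassifierAccuracyAnalysis and running it
//via runAccuracyAnalysis. The classifier name, criterion, and numeric parameters are assumed placeholders, and it is
//assumed that aggregate fields such as overallAccuracy are accessible after the analysis runs.
public static void ExampleAccuracyAnalysisUsage(IEventSeriesProbabalisticClassifier <string> classifier, DiscreteSeriesDatabase <string> labeledData) {
	//80% train split, 4 train/test rounds, confidence buckets of width 0.05.
	ClassifierAccuracyAnalysis <string> analysis = new ClassifierAccuracyAnalysis <string>(classifier, "example classifier", labeledData, "author", 0.8, 4, 0.05);
	analysis.runAccuracyAnalysis();
	Console.WriteLine("Overall accuracy: " + analysis.overallAccuracy);
}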
public static void runNewsClassifierDerivation(string inFile, string outDirectory, int count, int iterations) { //Load the database: DiscreteSeriesDatabase <string> data = getNewsDataset(inFile, count); //data = data.SplitDatabase (.1).Item1; IEnumerable <Tuple <string, IEventSeriesProbabalisticClassifier <string> > > classifiers = TextClassifierFactory.NewsTestClassifiers().Concat(TextClassifierFactory.NewsTestAdvancedClassifiers().Skip(1)); IFeatureSynthesizer <string> synth = new CompoundFeatureSynthesizer <string>("author", new IFeatureSynthesizer <string>[] { new VarKmerFrequencyFeatureSynthesizer <string>("author", 3, 2, 60, 0.7, false), new VarKmerFrequencyFeatureSynthesizer <string>("location", 3, 3, 50, 1, false), new VarKmerFrequencyFeatureSynthesizer <string>("gender", 3, 8, 50, 10, false), new DateValueFeatureSynthesizer("date"), new LatinLanguageFeatureSynthesizer("author") }); WriteupGenerator.ProduceClassifierComparisonWriteup <string>("Classifier Comparison Analysis on Ekantipur News Articles", "Cyrus Cousins with Shirish Pokharel", 20, 20, outDirectory, classifiers.ToArray(), "News", data, "author", iterations, new[] { "author", "location", "date", "gender" }, synth); }
protected override IEventSeriesScalarRegressor <Ty>[] CreateRegressors(DiscreteSeriesDatabase <Ty> data) {
	//Partition into classed and classless groups.
	Tuple <IEnumerable <DiscreteEventSeries <Ty> >, IEnumerable <DiscreteEventSeries <Ty> > > partitioned = data.data.Partition(item => item.labels.ContainsKey(ClassificationCriterion));
	IEnumerable <DiscreteEventSeries <Ty> > noClass = partitioned.Item2; //These series have no class for the category label for which the feature synthesizer is being created.
	IEnumerable <DiscreteEventSeries <Ty> > inClass = partitioned.Item1;

	IEnumerable <IGrouping <string, DiscreteEventSeries <Ty> > > groupings = inClass.GroupBy(item => item.labels[ClassificationCriterion]);

	//Establish multisets for each class.
	IEnumerable <Tuple <string, Multiset <Ty> > > classSets = groupings.Map(grp => Tuple.Create(grp.Key, grp.ToMultiset())).ToArray(); //Used twice, so make it an array.

	//Establish the baseline (all data).
	Multiset <Ty> baseline = noClass.ToMultiset().Cons(classSets.Select(a => a.Item2)).MultisetUnion();

	return(classSets.Map(ntp => new ItemFrequencyRegressor <Ty>(ntp.Item1, minSignificantCount, smoothingAmount, featuresToUse, baseline, ntp.Item2)).ToArray());
}
//Train an IFeatureSynthesizer model.
public void Train(DiscreteSeriesDatabase <Ty> trainingData) {
	//Partition into classed and classless groups.
	Tuple <IEnumerable <DiscreteEventSeries <Ty> >, IEnumerable <DiscreteEventSeries <Ty> > > partitioned = trainingData.data.Partition(item => item.labels.ContainsKey(ClassificationCriterion));
	IEnumerable <DiscreteEventSeries <Ty> > classedSeries = partitioned.Item1;
	IEnumerable <DiscreteEventSeries <Ty> > classlessSeries = partitioned.Item2; //These items have no class for the category label for which the feature synthesizer is being created.

	TupleStruct <string, MultisetKmer <Ty> >[] classes = classedSeries.AsParallel()
		.GroupBy(series => series.labels[ClassificationCriterion]) //Group by class.
		.Select(grp => new TupleStruct <string, MultisetKmer <Ty> >(grp.Key, ((IEnumerable <DiscreteEventSeries <Ty> >)grp).ToMultisetVarKmer <Ty> (k))) //Collapse each class into a single multiset.
		.OrderBy(tup => tup.Item1) //Sort by name.
		.ToArray();

	/*
	 * Console.WriteLine("GROUPS");
	 * foreach(var v in classes){
	 * 	Console.WriteLine ("Class " + v.Item1 + " size " + v.Item2.Size ());
	 * }
	 */

	MultisetKmer <Ty> baseline;
	if (useUncategorizedForBaseline) {
		baseline = classlessSeries.ToMultisetVarKmer(k).Cons(classes.Select(@class => @class.Item2)).MultisetKmerUnion(); //TODO: reuse the classless multiset.
	} else {
		baseline = classedSeries.ToMultisetVarKmer(k);
	}

	//We now have data for all classes and the baseline.
	IEnumerable <TupleStruct <Kmer <Ty>, double> > characteristicKmers = Enumerable.Range(0, classes.Length).AsParallel().SelectMany(index => ExtractCharacteristicKmersForClass(index, classes[index].Item2, baseline));

	//Lookup for all kmers.
	kmersOntoIndex = characteristicKmers.OrderByDescending(item => item.Item2).Select(item => item.Item1).Distinct().Take((int)kmersToTake).IndexLookupDictionary();
	kmerCount = kmersOntoIndex.Count;
}
public static void runNewsClassification(string inFile, string outDirectory, int count, int iterations) { DiscreteSeriesDatabase <string> data = getNewsDataset(inFile, count); //Create the classifier /* * IEventSeriesProbabalisticClassifier<string> classifier = new SeriesFeatureSynthesizerToVectorProbabalisticClassifierEventSeriesProbabalisticClassifier<string>( * new VarKmerFrequencyFeatureSynthesizer<string>("author", 3, 2, 60, 0.1, false), * new NullProbabalisticClassifier() * ); */ IEventSeriesProbabalisticClassifier <string> classifier = new SeriesFeatureSynthesizerToVectorProbabalisticClassifierEventSeriesProbabalisticClassifier <string>( new VarKmerFrequencyFeatureSynthesizer <string>("author", 3, 2, 50, 0.6, false), new PerceptronCloud(16.0, PerceptronTrainingMode.TRAIN_ALL_DATA, PerceptronClassificationMode.USE_NEGATIVES | PerceptronClassificationMode.USE_SCORES, 1.5, false) ); //string documentTitle, string author, int width, int height, string outFile, IEventSeriesProbabalisticClassifier<Ty> classifier, DiscreteEventSeries<Ty> dataset, string datasetTitle, string criterionByWhichToClassify WriteupGenerator.ProduceClassificationReport <string>("Analysis and Classification of " + data.data.Count + " Ekantipur Articles", "Cyrus Cousins with Shirish Pokharel", 20, 20, outDirectory, classifier, "characteristic kmer classifier", data, "News", "author", iterations); }
public static double ScoreModel <Ty> (this IFeatureSynthesizer <Ty> synth, DiscreteSeriesDatabase <Ty> testData, int verbosity, string nameCategory = null) {
	Dictionary <string, int> classRanks = synth.GetFeatureSchema().Select((item, index) => new Tuple <string, int> (item, index)).ToDictionary(a => a.Item1, a => a.Item2);

	//Display the schema.
	if (verbosity >= 2) {
		Console.WriteLine(synth.GetFeatureSchema().FoldToString());
	}

	double score = testData.data.AsParallel()
		.Where(item => classRanks.ContainsKey(item.labels.GetWithDefault(synth.ClassificationCriterion, ""))) //Filter for items for which we have regressors.
		.Select(i => ScoreModelSingle(synth, classRanks, i, verbosity, nameCategory)).Average(); //Score them and take the average.

	if (verbosity >= 2) {
		Console.WriteLine("Total Score = " + score);
		Console.WriteLine("E[random model score] = " + (1.0 / classRanks.Count));
	}
	return(score);
}
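//Illustrative usage sketch (not part of the original source): scoring a trained synthesizer on held-out data.
//The 0.8 split fraction and the "filename" naming category mirror calls elsewhere in this file, but are assumptions here.
public static void ExampleScoreModelUsage(IFeatureSynthesizer <string> synth, DiscreteSeriesDatabase <string> allData) {
	Tuple <DiscreteSeriesDatabase <string>, DiscreteSeriesDatabase <string> > split = allData.SplitDatabase(.8);
	synth.Train(split.Item1);
	double score = synth.ScoreModel(split.Item2, 2, "filename"); //Verbosity 2 also prints the schema and the total score.
	Console.WriteLine("Held-out score: " + score + " (random baseline ~" + (1.0 / synth.GetFeatureSchema().Length) + ")");
}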
public static void LoadTextDatabase(this DiscreteSeriesDatabase <string> db, string directory, TextReader inStream, Func <string, string> textProcessor, int logLevel = 0) //TODO make this an extension method for DiscreteSeries<string> ? { List <DiscreteEventSeries <string> > fileData = db.data; //Read the file in string s = inStream.ReadToEnd(); string[] entries = s.Split(newLine, StringSplitOptions.RemoveEmptyEntries); //TODO: Zip archive version of the following. //Read each file List <DiscreteEventSeries <string> > items = entries.AsParallel().Select(entry => { Dictionary <string, string> entryDict = processEntryLine(entry, logLevel); return(processEntryFromFile(directory, entryDict, logLevel, textProcessor)); }).Where(entry => entry != null).ToList(); fileData.AddRange(items); //Print some information about what has been read. if (logLevel >= 1) { Console.WriteLine("Loaded " + items.Count + " / " + entries.Length + " discrete event series. " + items.TotalItemCount() + " total words added."); if (logLevel >= 3) { IEnumerable <string> categoryKeys = items.SelectMany(item => item.labels.Keys).Distinct().Where(item => item != "filename"); foreach (string key in categoryKeys) { Console.WriteLine("Classification Criterion: " + key); Console.WriteLine(items.GroupBy(item => item.labels.GetWithDefault(key, "[none]")) //Group by category .FoldToString(item => item.Key + " (" + item.Select(subitem => subitem.data.Length).Sum() + " words): " //Count words per category + item.FoldToString(subitem => subitem.labels["filename"] + " (" + subitem.data.Length + " words)"), "", "", "\n")); //Show each item in category. } } } }
public static void TestNewDesign() { DiscreteSeriesDatabase <string> allData = LoadRegionsDatabase(); Tuple <DiscreteSeriesDatabase <string>, DiscreteSeriesDatabase <string> > split = allData.SplitDatabase(.8); DiscreteSeriesDatabase <string> trainingData = split.Item1; DiscreteSeriesDatabase <string> testData = split.Item2; IFeatureSynthesizer <string> synth = new RegressorFeatureSynthesizerKmerFrequenciesVarK <string>("region", 8, 2, 100, 3); //IFeatureSynthesizer<string> synth = new RegressorFeatureSynthesizerKmerFrequencies<string>("region", 4, 10, 100, 3); //IFeatureSynthesizer<string> synth = new RegressorFeatureSynthesizerFrequencies<string>("region", 4, 10, 100); synth.Train(trainingData); Console.WriteLine(synth.ToString()); synth.ScoreModel(testData, 2, "filename"); Console.WriteLine(ClassifyDataSet(synth, testData, "filename")); //TODO may be good to use something unspecifiable in the file syntax such as "filename;" //Console.WriteLine (allData.DatabaseLatexString("Regional Spanish Database")); }
public void Train(DiscreteSeriesDatabase <string> data) { throw new Exception("Cannot train a TextFeatureSynthesizer."); }
public ClassifierAccuracyAnalysis <Ty> runAccuracyAnalysis() {
	string nameCriterion = "filename"; //TODO: Input parameter or constant for this.

	//Filter out items not labeled for this criterion.
	labeledData = labeledData.FilterForCriterion(criterionByWhichToClassify);

	datasetSchema = labeledData.getLabelClasses(criterionByWhichToClassify).Order().ToArray();

	//Create mapping from strings to indices.
	schemaMapping = datasetSchema.IndexLookupDictionary();

	bucketCount = (int)(1.0 / bucketSize);
	classCount = datasetSchema.Length;

	//Raw data classifications:
	//Name, true class, predicted class, scores, winning score.
	classificationInstances = new List <Tuple <string, string, string, double[], double> > ();
	if (testOverfitting) {
		trainingDataClassificationInstances = new List <Tuple <string, string, string, double[], double> > ();
	}

	string classifierName = "\"" + AlgorithmReflectionExtensions.GetAlgorithmName(classifier) + "\"";

	Console.WriteLine("Running classifier " + classifierName + " on " + labeledData.data.Count + " items.");

	//TODO: Not a bad idea to duplicate the classifiers to increase parallelism.

	//Run and make classifiers.
	for (int i = 0; i < iterations; i++) {
		Console.WriteLine("Classifier Accuracy: Initiating round " + (i + 1) + " / " + iterations + " for " + classifierName + ".");
		Tuple <DiscreteSeriesDatabase <Ty>, DiscreteSeriesDatabase <Ty> > split = labeledData.SplitDatabase(trainSplitFrac); //TODO: Vary this?
		DiscreteSeriesDatabase <Ty> training = split.Item1;
		DiscreteSeriesDatabase <Ty> test = split.Item2;

		classifier.Train(training);

		string[] classifierSchema = classifier.GetClasses();
		classificationInstances.AddRange(test.data.AsParallel().WithExecutionMode(ParallelExecutionMode.ForceParallelism).Select(item => classificationInfo(classifier, classifierSchema, schemaMapping, item, nameCriterion, criterionByWhichToClassify)));
		if (testOverfitting) {
			trainingDataClassificationInstances.AddRange(training.data.Take((int)(overfittingTestFrac * training.data.Count)).AsParallel().WithExecutionMode(ParallelExecutionMode.ForceParallelism).Select(item => classificationInfo(classifier, classifierSchema, schemaMapping, item, nameCriterion, criterionByWhichToClassify)));
		}
	}

	//Confusion matrices.
	confusionMatrixCounts = new int[classCount, classCount]; //[a,b]: how often a is classified as b.
	confusionMatrixScores = new double[classCount, classCount];

	//Confusion matrix allocation.
	countsConfusionMatricesByConfidence = new int[bucketCount][,];
	scoresConfusionMatricesByConfidence = new double[bucketCount][,];
	for (int i = 0; i < bucketCount; i++) {
		countsConfusionMatricesByConfidence [i] = new int[classCount, classCount];
		scoresConfusionMatricesByConfidence [i] = new double[classCount, classCount];
	}

	//Confusion matrix population.
	foreach (var classification in classificationInstances) {
		int confidenceBucket = Math.Min((int)Math.Floor(classification.Item5 * bucketCount), bucketCount - 1); //On a score of 1 or greater, clip to the top bucket. The highest confidence is always positive because confidences sum to 1.

		//Counts
		confusionMatrixCounts [schemaMapping [classification.Item2], schemaMapping [classification.Item3]]++;
		countsConfusionMatricesByConfidence [confidenceBucket] [schemaMapping [classification.Item2], schemaMapping [classification.Item3]]++;

		//Scores
		for (int j = 0; j < classCount; j++) {
			confusionMatrixScores [schemaMapping [classification.Item2], j] += classification.Item4 [j];
			scoresConfusionMatricesByConfidence [confidenceBucket] [schemaMapping [classification.Item2], j] += classification.Item4 [j];
		}
	}

	//Aggregates
	countColumnSums = Enumerable.Range(0, classCount).Select(i => (double)confusionMatrixCounts.SumColumn(i)).ToArray();
	countRowSums = Enumerable.Range(0, classCount).Select(i => (double)confusionMatrixCounts.SumRow(i)).ToArray();
	scoreColumnSums = Enumerable.Range(0, classCount).Select(i => confusionMatrixScores.SumColumn(i)).ToArray();
	scoreRowSums = Enumerable.Range(0, classCount).Select(i => confusionMatrixScores.SumRow(i)).ToArray();

	classCountAccuracies = Enumerable.Range(0, classCount).Select(c => confusionMatrixCounts [c, c] / (double)countRowSums [c]).ToArray();
	overallAccuracy = Enumerable.Range(0, classCount).Select(i => (double)confusionMatrixCounts [i, i]).Sum() / classificationInstances.Count;

	expectedAccuracyRandom = (1.0 / classCount);
	topClassSelectionAccuracy = labeledData.GroupBy(item => item.labels[criterionByWhichToClassify]).Select(grp => grp.Count()).Max() / (double)labeledData.data.Count;

	//Safety check.
	{
		double countSum = countColumnSums.Sum();
		double scoreSum = scoreColumnSums.Sum();

		//These should both equal classificationInstances.Count, to within numeric error.
		Trace.Assert(Math.Abs(countSum - classificationInstances.Count) < .00001);
		Trace.Assert(Math.Abs(scoreSum - classificationInstances.Count) < .00001);
	}

	//Class, Confidence Bucket
	accuracyByTrueClassAndConfidence = new double[classCount + 1, bucketCount];
	accuracyByPredictedClassAndConfidence = new double[classCount + 1, bucketCount];

	for (int i = 0; i < bucketCount; i++) {
		for (int j = 0; j < classCount; j++) {
			accuracyByTrueClassAndConfidence [j + 1, i] = (double)countsConfusionMatricesByConfidence [i] [j, j] / (double)countsConfusionMatricesByConfidence [i].SumRow(j);
			accuracyByPredictedClassAndConfidence [j + 1, i] = (double)countsConfusionMatricesByConfidence [i] [j, j] / (double)countsConfusionMatricesByConfidence [i].SumColumn(j);
		}
		//TODO: Never use a try catch block, and punish those who do.
		try {
			accuracyByTrueClassAndConfidence [0, i] = Enumerable.Range(1, classCount).Select(j => accuracyByTrueClassAndConfidence [j, i]).Where(val => !Double.IsNaN(val)).Average();
		} catch {
			accuracyByTrueClassAndConfidence [0, i] = Double.NaN;
		}
		try {
			accuracyByPredictedClassAndConfidence [0, i] = Enumerable.Range(1, classCount).Select(j => accuracyByPredictedClassAndConfidence [j, i]).Where(val => !Double.IsNaN(val)).Average(); //Previously averaged accuracyByTrueClassAndConfidence here; the predicted-class accuracies are the intended values.
		} catch {
			accuracyByPredictedClassAndConfidence [0, i] = Double.NaN;
		}
	}

	//For use in math mode elements and matrices.
	datasetSchemaText = datasetSchema.Select(item => @"\text{" + LatexExtensions.limitLength(item, 15) + "}").ToArray();
	datasetSchemaTextRotated = datasetSchemaText.Select(item => @"\begin{turn}{70} " + item + @" \end{turn}").ToArray(); //TODO: Limiting length could cause duplication.

	return(this);
}
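//Illustrative sketch (not part of the original source) of the confidence-bucket index used above to populate
//countsConfusionMatricesByConfidence: winning scores in [0, 1) map to floor(score * bucketCount), and a score of
//exactly 1 (or above, from numeric error) is clipped into the top bucket, so every classification lands in exactly one bucket.
public static int ExampleConfidenceBucket(double winningScore, int bucketCount) {
	return Math.Min((int)Math.Floor(winningScore * bucketCount), bucketCount - 1); //e.g. 0.37 with 20 buckets -> bucket 7; 1.0 -> bucket 19.
}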
public static double ScoreModelType <Ty>(IEnumerable <string> categoryLabels, Func <string, IFeatureSynthesizer <Ty> > modelGenerator, DiscreteSeriesDatabase <Ty> trainingData, DiscreteSeriesDatabase <Ty> testData) {
	double sumScore = 0;
	int count = 0;
	foreach (string categoryLabel in categoryLabels) {
		//Train a model for this category label.
		IFeatureSynthesizer <Ty> model = modelGenerator(categoryLabel);
		model.Train(trainingData);
		sumScore += model.ScoreModel(testData);
		count++; //Count the models scored so the average below is well defined.
	}
	return(sumScore / count);
}
//Score a model. Value returned on [0, 1], where 1 represents a perfectly accurate model and 0 a completely inaccurate model. public static double ScoreModel <Ty> (this IFeatureSynthesizer <Ty> synth, DiscreteSeriesDatabase <Ty> testData) { return(synth.ScoreModel(testData, 1)); }
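//Illustrative usage sketch (not part of the original source): averaging a synthesizer type's score over several
//category labels with ScoreModelType. The labels and constructor parameters are assumed placeholders, and the call
//assumes this example lives in the same static class that defines ScoreModelType.
public static void ExampleScoreModelTypeUsage(DiscreteSeriesDatabase <string> trainingData, DiscreteSeriesDatabase <string> testData) {
	string[] categories = new[] { "region", "type" };
	double meanScore = ScoreModelType <string>(categories,
		category => new VarKmerFrequencyFeatureSynthesizer <string>(category, 3, 4, 50, 2.0, false),
		trainingData, testData);
	Console.WriteLine("Mean score over " + categories.Length + " criteria: " + meanScore);
}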
//Train an IFeatureSynthesizer model.
public void Train(DiscreteSeriesDatabase <Ty> trainingData) {
	//Partition into classed and classless groups.
	Tuple <IEnumerable <DiscreteEventSeries <Ty> >, IEnumerable <DiscreteEventSeries <Ty> > > partitioned = trainingData.data.Partition(item => item.labels.ContainsKey(ClassificationCriterion));
	IEnumerable <DiscreteEventSeries <Ty> > classedSeries = partitioned.Item1;
	IEnumerable <DiscreteEventSeries <Ty> > classlessSeries = partitioned.Item2; //These items have no class for the category label for which the feature synthesizer is being created.

	TupleStruct <string, MultisetKmer <Ty> >[] classes = classedSeries.AsParallel()
		.GroupBy(series => series.labels[ClassificationCriterion]) //Group by class.
		.Select(grp => new TupleStruct <string, MultisetKmer <Ty> >(grp.Key, ((IEnumerable <DiscreteEventSeries <Ty> >)grp).ToMultisetVarKmer <Ty> (k))) //Collapse each class into a single multiset.
		.OrderBy(tup => tup.Item1) //Sort by name.
		.ToArray();

	MultisetKmer <Ty> baseline;
	if (useUncategorizedForBaseline) {
		baseline = classlessSeries.ToMultisetVarKmer(k).Cons(classes.Select(@class => @class.Item2)).MultisetKmerUnion(); //TODO: reuse the classless multiset.
	} else {
		baseline = classedSeries.ToMultisetVarKmer(k);
	}

	//We now have data for all classes and the baseline.

	//Create the data structures.
	classCount = classes.Length;

	//Lookup for all class strings.
	classLookup = classes.Select(tup => tup.Item1).IndexLookupDictionary();

	//Console.WriteLine ("Training. " + classes.Length + " classes, " + trainingData.data.Count + " instances.");

	/* Not parallelized:
	 * foreach(int classIndex in Enumerable.Range (0, classCount)){
	 * 	//All the kmers found in this class.
	 * 	TupleStruct<string, MultisetKmer<Ty>> thisClass = classes[classIndex];
	 * 	List<TupleStruct<Kmer<Ty>, double>> thisClassCharacteristicKmersStore = new List<TupleStruct<Kmer<Ty>, double>>();
	 * 	foreach(KeyValuePair<Kmer<Ty>, int> kvp in thisClass.Item2){
	 * 		if(kvp.Value > minKmerCount){
	 * 			double thisFreq = kvp.Value / (double) thisClass.Item2.Size (kvp.Key.data.Count);
	 * 			double baseFreq = baseline.GetKeyFracLaplace(kvp.Key, smoothingAmt);
	 *
	 * 			//Console.WriteLine ("Class: " + classIndex + " Kmer: " + kvp.Value + ", class freq " + thisFreq + ", base freq " + baseFreq);
	 *
	 * 			//TODO: Advanced logic.
	 * 			if(thisFreq > baseFreq){
	 * 				double kmerValue = thisFreq - baseFreq;
	 * 				//Console.WriteLine ("Adding kmer " + kvp.Key + " weight " + kmerValue + " for class " + classIndex);
	 * 				thisClassCharacteristicKmersStore.Add (new TupleStruct<Kmer<Ty>, double>(kvp.Key, kmerValue));
	 * 			}
	 * 		}
	 * 	}
	 * 	foreach(TupleStruct<Kmer<Ty>, double> kmerToAdd in thisClassCharacteristicKmersStore.OrderBy (tup => Math.Abs (tup.Item2)).Take ((int)kmersToTake)){ //TODO: Unordered kth order statistic.
	 * 		learnedCharacteristicKmers.GetWithDefaultAndAdd(kmerToAdd.Item1, () => new Dictionary<int, double>(classCount))[classIndex] = kmerToAdd.Item2;
	 * 	}
	 * }
	 */

	//Parallelized (find characteristic kmers for each class in parallel):
	IEnumerable <TupleStruct <int, IEnumerable <TupleStruct <Kmer <Ty>, double> > > > characteristicKmers = Enumerable.Range(0, classCount).AsParallel().Select(index => new TupleStruct <int, IEnumerable <TupleStruct <Kmer <Ty>, double> > > (index, ExtractCharacteristicKmersForClass(index, classes[index].Item2, baseline)));

	//Discard empty features.
	if (discardEmptyFeatures) {
		characteristicKmers = characteristicKmers.ToArray();
		bool[] classFound = new bool[classCount];
		int foundCount = 0;
		foreach (var v in characteristicKmers) {
			if (!classFound[v.Item1]) {
				classFound[v.Item1] = true;
				foundCount++;
				if (foundCount == classCount) {
					break;
				}
			}
		}
		if (foundCount < classCount) {
			string[] newClasses = classes.Where((@class, index) => classFound[index]).Select(@class => @class.Item1).ToArray();
			Dictionary <string, int> newClassLookup = newClasses.IndexLookupDictionary();
			int[] oldToNewMapping = new int[classes.Length];
			foreach (string s in classes.Select(@class => @class.Item1)) {
				oldToNewMapping[classLookup[s]] = newClassLookup.GetWithDefault(s, 0);
			}
			characteristicKmers = characteristicKmers.Select(kmer => new TupleStruct <int, IEnumerable <TupleStruct <Kmer <Ty>, double> > > (oldToNewMapping[kmer.Item1], kmer.Item2));
			//classes = newClasses; //TODO: May need this for negative kmers.
			classLookup = newClassLookup;
			classCount = foundCount;
			features = @newClasses.Order().ToArray();
		} else {
			features = @classes.Select(@class => @class.Item1).Order().ToArray();
		}
	} else {
		features = @classes.Select(@class => @class.Item1).Order().ToArray();
	}

	//This part probably can't be parallelized (adding to the same dictionary), but it should be light.
	learnedCharacteristicKmers = new Dictionary <Kmer <Ty>, Dictionary <int, double> >();
	//TODO: Unordered kth order statistic.
	foreach (TupleStruct <int, IEnumerable <TupleStruct <Kmer <Ty>, double> > > thisClass in characteristicKmers) {
		//Console.WriteLine ("Class " + thisClass.Item1 + " contains " + thisClass.Item2.Count() + " above average kmers.");
		int thisClassIndex = thisClass.Item1;
		IEnumerable <TupleStruct <Kmer <Ty>, double> > thisClassCharacteristicKmersStore = thisClass.Item2;
		foreach (TupleStruct <Kmer <Ty>, double> kmerToAdd in thisClassCharacteristicKmersStore) {
			learnedCharacteristicKmers.GetWithDefaultAndAdd(
				kmerToAdd.Item1,
				() => new Dictionary <int, double>(classCount / 2) //TODO: Dictionary size: guessing classCount / 2, which is a bad heuristic.
			)[thisClassIndex] = kmerToAdd.Item2;
		}
	}

	//TODO: Negative kmers (note: may complicate sizing; will not work without a lot of data).
	//TODO: Put this back in.
	//features.MapInPlace(fname => ClassificationCriterion + ":" + fname); //Add classification criterion to feature names.
}
//CLASSIFICATION: public static string ClassifyDataSet <Ty>(IFeatureSynthesizer <Ty> synth, DiscreteSeriesDatabase <Ty> db, string nameField) { return(db.data.AsParallel().Select(item => ClassifyItem(synth, item, nameField)).FoldToString()); }
protected abstract IEventSeriesScalarRegressor <Ty>[] CreateRegressors(DiscreteSeriesDatabase <Ty> data);
public void Train(DiscreteSeriesDatabase <Ty> data) { synth.Train(data); }
public static DiscreteSeriesDatabase <string> getNewsDataset(string fileName, int count = 0) { DiscreteSeriesDatabase <string> data = new DiscreteSeriesDatabase <string> (); using (StreamReader keyfile = File.OpenText(fileName + "key")) { if (count > 0) { keyfile.BaseStream.Seek(-107 * count, System.IO.SeekOrigin.End); //avg line is ~81 characters. keyfile.ReadLine(); } // for(int i = 0; i < 8000; i++) keyfile.ReadLine (); data.LoadTextDatabase(fileName + "/", keyfile, DatabaseLoader.ProcessEnglishText, 1); } //Do some processing on the database foreach (DiscreteEventSeries <string> item in data.data) { string author = AsciiOnly(item.labels ["author"], false).RegexReplace(@"_+", @" ").RegexReplace(@"(?:[<])|(?:^[ ,])|(?:$)|(?:\')|(?:\\)", "").RegexReplace(@"([#$&])", @"\$1"); author = manualRenames.GetWithDefault(author, author); if (author.StartsWith(@" ")) //TODO: Why is this not caught by the regex? { author = author.Substring(1); } if (invalidAuthors.Contains(author)) { //Console.WriteLine ("REMOVED " + author); item.labels.Remove("author"); } else { item.labels ["author"] = NameCase(author); //Put the formatting done above back into db string[] authSplit = author.Split(' '); string firstName = authSplit[0].ToLower(); if (titles.Contains(firstName) && authSplit.Length > 1) { if (authSplit.Length == 2) { //Just a last name. firstName = "a"; //Will be marked neutral. } else { firstName = authSplit[1]; } } if (neutralNames.Contains(firstName) || firstName.Length == 1) { //Gender unknown } else if (maleNames.Contains(firstName) || firstName.EndsWith("ndra")) { item.labels["gender"] = "male"; } else if (firstName[firstName.Length - 1] == 'a' || firstName.EndsWith("ee") || femaleNames.Contains(firstName)) { item.labels["gender"] = "female"; } else if ("eiou".Contains(firstName[firstName.Length - 1])) { //Gender unknown (suspected female) } else if (firstName.Length > 1) { item.labels["gender"] = "male"; } } item.labels ["filename"] = item.labels ["filename"].Replace("_", " ").RegexReplace("([#$&])", "\\$1"); if (item.labels.ContainsKey("location")) { item.labels ["location"] = item.labels ["location"].Replace("_", " ").RegexReplace("([#$&])", "\\$1"); item.labels ["location"] = manualLocationRenames.GetWithDefault(item.labels["location"], item.labels["location"]); item.labels ["location"] = NameCase(item.labels ["location"]); } } return(data); }
/* * public void TestNewClassifiers(){ * * } */ public static void TestLatex() { bool test = true; bool shorten = true; bool costarica = true; bool cuba = true; if (test) { costarica = cuba = false; } DiscreteSeriesDatabase <string> allData = LoadRegionsDatabase(test, shorten, costarica, cuba); /* * IFeatureSynthesizer<string> testSynth = new VarKmerFrequencyFeatureSynthesizer<string>("region", 3, 4, 50, 2.0, true); * testSynth.Train (allData); * * Console.WriteLine (testSynth.GetFeatureSchema().FoldToString ()); * Console.WriteLine (testSynth.SynthesizeFeaturesSumToOne(new DiscreteEventSeries<string>(allData.data.First ().labels, allData.data.First ().Take (25).ToArray ())).FoldToString (d => d.ToString ("F3"))); * Console.ReadLine (); */ /* * if(test){ * allData = allData.SplitDatabase (.25).Item1; * } */ //TODO: Add length distribution for documents and each type. //Create a feature synthesizer //IFeatureSynthesizer<string> synth = new RegressorFeatureSynthesizerKmerFrequenciesVarK<string>("region", 8, 2, 100, 3); //Slowld way //IFeatureSynthesizer<string> synth = new VarKmerFrequencyFeatureSynthesizer<string>("region", 3, 4, 50, 2.0, true); //IEventSeriesProbabalisticClassifier<string> textClassifier // = TextClassifierFactory.TextClassifier ("region", new[]{"region", "type"}); //string documentTitle, string author, int width, int height, string outFile, IEnumerable<Tuple<string, IEventSeriesProbabalisticClassifier<Ty>>> classifiers, string datasetTitle, DiscreteSeriesDatabase<Ty> dataset, string criterionByWhichToClassify //IEnumerable<Tuple<string, IEventSeriesProbabalisticClassifier<string>>> classifiers = TextClassifierFactory.RegionsTestClassifiers().ToArray (); IEnumerable <Tuple <string, IEventSeriesProbabalisticClassifier <string> > > classifiers = TextClassifierFactory.RegionsPerceptronTestClassifiers().ToArray(); IFeatureSynthesizer <string> synthesizer = new CompoundFeatureSynthesizer <string>( "region", new IFeatureSynthesizer <string>[] { new VarKmerFrequencyFeatureSynthesizerToRawFrequencies <string>("region", 2, 2, 16, .1, false), new LatinLanguageFeatureSynthesizer("region"), new VarKmerFrequencyFeatureSynthesizer <string>("region", 3, 4, 50, 2.0, false), new VarKmerFrequencyFeatureSynthesizer <string>("type", 3, 3, 50, 2.0, false) } ); if (test) { classifiers = classifiers.Take(2); } WriteupGenerator.ProduceClassifierComparisonWriteup <string>("Spanish Language Dialect Analysis", "Cyrus Cousins", 11, 16, "../../out/spanish/classification/", classifiers.ToArray(), "Spanish Language", allData, "region", test ? 1 : 4, analysisCriteria: new[] { "region", "type" }, synthesizer: synthesizer); /* * if (classifier is SeriesFeatureSynthesizerToVectorProbabalisticClassifierEventSeriesProbabalisticClassifier<string>) { * IFeatureSynthesizer<string> synthesizer = ((SeriesFeatureSynthesizerToVectorProbabalisticClassifierEventSeriesProbabalisticClassifier<string>)classifier).synthesizer; * * //doc.Append ("\\section{Feature Synthesizer Analysis}\n\n"); * //doc.Append (synthesizer.FeatureSynthesizerLatexString(allData)); * } */ }
//Load a database file from a stream. public static void LoadTextDatabase(this DiscreteSeriesDatabase <string> db, TextReader inStream, Func <string, string> textProcessor) { LoadTextDatabase(db, "", inStream, textProcessor); }
public static IFeatureSynthesizer <string> deriveOptimalClassifier() { //Load databases DiscreteSeriesDatabase <string> allData = LoadRegionsDatabase(); Tuple <DiscreteSeriesDatabase <string>, DiscreteSeriesDatabase <string> > split = allData.SplitDatabase(.8); DiscreteSeriesDatabase <string> trainingData = split.Item1; DiscreteSeriesDatabase <string> testData = split.Item2; string cat = "region"; double optimalScore = 0; IFeatureSynthesizer <string> optimalClassifier = null; string optimalInfoStr = null; //Preliminary scan int[] ks = new int[] { 2, 3, 4 }; //int[] minCutoffs = new int[]{5, 10, 20}; int[] minCutoffs = new int[] { 10 }; int[] kmerCounts = new int[] { 10, 25, 50, 100 }; int[] smoothingAmounts = new int[] { 1, 5, 10 }; string[] colNames = "k minCutoff kmerCount smoothingAmount score".Split(' '); Console.WriteLine(colNames.FoldToString("", "", ",")); foreach (int k in ks) { foreach (int minCutoff in minCutoffs) { foreach (int kmerCount in kmerCounts) { foreach (int smoothingAmount in smoothingAmounts) { IFeatureSynthesizer <string> classifier = new RegressorFeatureSynthesizerKmerFrequenciesVarK <string>(cat, minCutoff, smoothingAmount, kmerCount, k); classifier.Train(trainingData); double score = classifier.ScoreModel(testData); string infoStr = new double[] { k, minCutoff, kmerCount, smoothingAmount, score }.FoldToString("", "", ","); Console.WriteLine(infoStr); if (score > optimalScore) { optimalScore = score; optimalClassifier = classifier; optimalInfoStr = infoStr; } } } } } Console.WriteLine("Optimal Classifier:"); Console.WriteLine(optimalInfoStr); Console.WriteLine(optimalClassifier); return(optimalClassifier); }
public static DiscreteSeriesDatabase <string> LoadRegionsDatabase(bool test = false, bool shorten = false, bool costarica = true, bool cuba = true) { //Load training data and create classifier. string directory = "../../res/regiones/"; string[] regions = "españa argentina méxico colombia".Split(' '); string file = ""; if (costarica) { regions = "costarica".Cons(regions).ToArray(); } if (cuba) { regions = "cuba".Cons(regions).ToArray(); } //string[] prefixes = new[]{"", "literatura", "historia", "lengua"}; //file += prefixes.Select (prefix => regions.FoldToString ((sum, val) => sum + "region" + ":" + val + ";" + "type" + ":" + "news" + " " + prefix + val, "", "", "\n")).FoldToString ("", "", "\n"); file += regions.Aggregate("", (sum, val) => sum + "region" + ":" + val + ";" + "type" + ":" + "news" + " " + val + "\n"); file += regions.Aggregate("", (sum, val) => sum + "region" + ":" + val + ";" + "type" + ":" + "wiki" + " " + "literatura" + val + "\n"); file += regions.Aggregate("", (sum, val) => sum + "region" + ":" + val + ";" + "type" + ":" + "wiki" + " " + "historia" + val + "\n"); file += regions.Aggregate("", (sum, val) => sum + "region" + ":" + val + ";" + "type" + ":" + "wiki" + " " + "lengua" + val + "\n"); file += regions.Aggregate("", (sum, val) => sum + "region" + ":" + val + ";" + "type" + ":" + "receta" + " " + "recetas" + val + "\n"); if (!test) { { string[] literatureRegions = "costarica costarica españa españa españa argentina argentina argentina argentina argentina argentina españa españa españa españa méxico méxico méxico méxico méxico méxico méxico colombia colombia colombia colombia colombia".Split(' '); string[] literatureNames = "leyendascr elisadelmar juanvaleraavuelaplumaespaña juanvaleraloscordobesesespaña marianela historiauniversal lamuerte buenosaires derroterosyviages fundaciondelaciudad laargentina mosenmillan historiadejudios viajosporespaña recuerdosybellezas leyendasmayas nahuatl laberinto comoaguaparachocolate mitoshorroresmexicanos leyendasmexicanas mitosurbanesmexicanos lamultituderrante viajoscolombianos leyendasurbanascolombianas mitoscolombianos mitoscolombianos2".Split(' '); IEnumerable <string> classesStrings = literatureRegions.Select(r => "region:" + r + ";" + "type:" + "literature"); file += classesStrings.Zip(literatureNames, (thisClasses, thisPath) => thisClasses + " " + thisPath).Aggregate(new StringBuilder(), (sum, val) => sum.Append(val).Append("\n")); } { string[] names = ( "salud antologia9 escorpionescr teca vacunoscr lanación universidadcr recetascostarica2 recetascostarica3 crcrawl presidentecostarica gobiernocostarica " + "arqueologiamaya poesiamexicana catolicismosocial unam mxcrawl cocrawl cocrawl2 desplazadoscolombianos mexicocnn méxicolgbt méxicogob historiaazteca historiaazteca2 " + "ordenamientoterretorrial competitividad ministerio" ).Split(' '); string[] tags = ( "region:costarica region:costarica region:costarica region:costarica region:costarica;type:paper region:costarica;type:news region:costarica region:costarica;type:receta region:costarica;type:receta region:costarica;type:website region:costarica;type:wiki region:costarica;type:wiki " + "region:méxico region:méxico;type:paper region:méxico;type:paper region:méxico;type:paper region:méxico;type:website region:colombia;type:website region:colombia;type:website region:colombia;type:wiki region:méxico;type:news region:méxico;type:brochure region:méxico;type:website region:méxico region:méxico " + "region:colombia region:colombia region:colombia" ).Split(' '); file += 
tags.Zip(names, (tag, name) => tag + " " + name).FoldToString("", "\n", "\n"); } } if (cuba) { file += "region:cuba;type:wiki cubaisla\n"; file += "region:cuba;type:receta recetascuba2\n"; file += "region:cuba;type:receta recetascuba3\n"; file += "region:cuba;type:literatura lahistoriame\n"; file += "region:cuba;type:literatura elencuentro\n"; } Console.WriteLine("Regions Database:"); Console.WriteLine(file); TextReader reader = new StringReader(file); DiscreteSeriesDatabase <string> d = new DiscreteSeriesDatabase <string> (); d.LoadTextDatabase(directory, reader, DatabaseLoader.ProcessSpanishText, 3); if (shorten) { d = new DiscreteSeriesDatabase <string>(d.Select(item => new DiscreteEventSeries <string>(item.labels, item.data.Take(750).ToArray()))); } return(d); }
//Construction: //Train an IFeatureSynthesizer model. public void Train(DiscreteSeriesDatabase <Ty> data) { regressors = CreateRegressors(data); }