//Regression.
//Scores a series by counting its k-mers (ToMultisetKmer with this instance's k) and handing the counts to the wrapped regressor.
public double RegressEventSeries(DiscreteEventSeries<A> series)
{
    //It's probably faster to score the k-mer counts than the raw series, particularly when many things are repeated.
    Multiset<Kmer<A>> kmerCounts = series.ToMultisetKmer(k);

    return regressor.RegressEventSeries(kmerCounts);
}
//Builds a new multiset and adds every element of the series' data to it, one Add call per element.
public static Multiset<Ty> ToMultiset<Ty>(this DiscreteEventSeries<Ty> series)
{
    Multiset<Ty> counts = new Multiset<Ty>();

    //Doesn't use the above, as an optimization: elements are added directly.
    foreach (Ty element in series.data)
    {
        counts.Add(element);
    }

    return counts;
}
//Creates a MultisetKmer constructed with k and fills it from the series through AddDiscreteEventSeriesVarKmer.
public static MultisetKmer<Ty> ToMultisetVarKmer<Ty>(this DiscreteEventSeries<Ty> series, int k)
{
    MultisetKmer<Ty> kmerCounts = new MultisetKmer<Ty>(k);
    kmerCounts.AddDiscreteEventSeriesVarKmer(series, k);

    return kmerCounts;
}
//Builds a one-line report for an item: its nameField label, the label feature picked by the synthesizer,
//and the largest of the item's sum-to-one normalized feature scores as the confidence.
public static string ClassifyItem<Ty>(IFeatureSynthesizer<Ty> synth, DiscreteEventSeries<Ty> item, string nameField)
{
    double confidence = synth.SynthesizeFeaturesSumToOne(item).Max();

    //TODO don't report ambiguous cases.
    string itemName = item.labels[nameField];
    string predictedLabel = synth.SynthesizeLabelFeature(item);

    //Same text as plain concatenation: no space between the label and the opening parenthesis.
    return string.Format("{0}: {1}({2} confidence)", itemName, predictedLabel, confidence);
}
//Regression.
//Adds up the wrapped regressor's score for the series' k-mer multiset at every k-mer length from 1 through maxK.
public double RegressEventSeries(DiscreteEventSeries<A> series)
{
    double total = 0;

    //For locality! Each length's multiset is built and scored before the next one is created.
    for (int length = 1; length <= maxK; length++)
    {
        Multiset<Kmer<A>> kmerCounts = series.ToMultisetKmer(length);
        total += regressor.RegressEventSeries(kmerCounts);
    }

    return total;
}
//Synthesize features for an item.
//Features, in order: "Word Count;Mean Sentence Length;Orthographical Error Rate;Formality;Textspeak"
//Mean sentence length is the word count divided by the number of words found in stops; the other three are
//the fraction of (lower-cased) words found in the corresponding word list.
//All ratios are floating-point divisions, so a zero denominator yields Infinity or NaN instead of throwing.
public double[] SynthesizeFeatures(DiscreteEventSeries<string> item)
{
    var words = item.data;
    double wordCount = words.Length;

    int sentenceStops = words.Count(word => stops.Contains(word));
    int spellingErrors = words.Count(word => englishSpellingErrors.Contains(word.ToLower()));
    int formalWords = words.Count(word => englishFormals.Contains(word.ToLower()));
    int textSpeakWords = words.Count(word => textSpeak.Contains(word.ToLower()));

    return new double[]
    {
        wordCount,
        wordCount / sentenceStops,
        spellingErrors / wordCount,
        formalWords / wordCount,
        textSpeakWords / wordCount
    };
}
//TODO .NET 4.5 only.

/*
 * public static DiscreteEventSeries<string> processEntryFromZipArchive (ZipArchive archive, Dictionary<string, string> tags, string entry, int logLevel, Func<string, string> textProcessor){
 *
 *  //Load the file as a set of words
 *  string filePath = directory + tags["filename"];
 *
 *  //Add the file as an entry to the database.
 *  using (StreamReader sr = File.OpenText(filePath)) {
 *      return loadEntry (tags, sr, logLevel, textProcessor);
 *  }
 * }
 */

//Reads the words from streamReader with DatabaseLoader.loadWordFileRaw and wraps them, together with tags,
//in a DiscreteEventSeries. When logLevel is 2 or more, a summary line (tags and word count) is written to the console.
public static DiscreteEventSeries<string> loadEntry(Dictionary<string, string> tags, StreamReader streamReader, int logLevel, Func<string, string> textProcessor)
{
    string[] tokens = DatabaseLoader.loadWordFileRaw(streamReader, textProcessor);
    DiscreteEventSeries<string> entry = new DiscreteEventSeries<string>(tags, tokens);

    if (logLevel < 2)
    {
        return entry;
    }

    string tagSummary = tags.FoldToString(tag => tag.Key + ":" + tag.Value);
    Console.WriteLine("Read " + tagSummary + ": " + tokens.Length + " words.");

    return entry;
}
//Synthesize features for an item.
//Features, in order: word count, mean word length, standard deviation of word length, and mean sentence length
//(word count divided by the number of words found in stops).
//Averaging throws on an item with no words; a zero stop count gives Infinity rather than an exception.
public double[] SynthesizeFeatures(DiscreteEventSeries<string> item)
{
    var words = item.data;

    double wordCount = words.Length;
    double meanWordLength = words.Average(word => word.Length);
    double stdevWordLength = words.Select(word => (double)word.Length).Stdev(meanWordLength);

    int sentenceStops = words.Count(word => stops.Contains(word));
    double meanSentenceLength = wordCount / sentenceStops;

    //TODO: Stdev sentence length would be nice.
    return new[] { wordCount, meanWordLength, stdevWordLength, meanSentenceLength };
}
/*
 * private IEnumerable<TupleStruct<Kmer<Ty>, double>> ExtractUncharacteristicKmersForClass (int classIndex, MultisetKmer<Ty> thisClass, MultisetKmer<Ty> baseline)
 * {
 *  //This will only work with an enormous amount of data for low k.
 * }
 */

//Calculation:

//Synthesize features for an item.
//TODO: Enforce contract
//Returns one slot per known k-mer (kmerCount slots). For every k-mer of the item that has an index in
//kmersOntoIndex, the slot holds its count divided by ms.Size(length of that k-mer); all other slots stay 0.
public double[] SynthesizeFeatures(DiscreteEventSeries<Ty> item)
{
    double[] frequencies = new double[kmerCount];
    MultisetKmer<Ty> kmerCounts = item.ToMultisetVarKmer<Ty>(k);

    foreach (KeyValuePair<Kmer<Ty>, int> entry in kmerCounts)
    {
        int slot;
        if (!kmersOntoIndex.TryGetValue(entry.Key, out slot))
        {
            //This k-mer is not part of the feature schema.
            continue;
        }

        //Presumably the total count of k-mers sharing this k-mer's length - confirm against MultisetKmer.Size.
        double sameLengthTotal = (double)kmerCounts.Size((int)entry.Key.Count);
        frequencies[slot] = entry.Value / sameLengthTotal;
    }

    return frequencies;
}
//Synthesize features for an item.
//Produces a single feature: the item's label stored under ClassificationCriterion, read as "year-month-day",
//expressed in years since 2000-01-01.
//Returns { 0.0 } when the label is missing or null, has fewer than three '-' separated terms, contains a term
//that is not an integer, or does not name a valid calendar date. Extra terms after the third must still be integers.
public double[] SynthesizeFeatures(DiscreteEventSeries<string> item)
{
    //Ticks are 100 ns, so this is the number of ticks in a 365.25 day year. Leap years are only partially handled.
    const double ticksPerYear = 10000000.0 * 60 * 60 * 24 * 365.25;

    string date;
    if (item.labels.TryGetValue(ClassificationCriterion, out date) && date != null)
    {
        string[] terms = date.Split('-');
        int[] split = new int[terms.Length];

        //Validate instead of relying on a catch-all: every term has to be an integer.
        bool allIntegers = true;
        for (int i = 0; i < terms.Length && allIntegers; i++)
        {
            allIntegers = Int32.TryParse(
                terms[i],
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out split[i]);
        }

        if (allIntegers && split.Length >= 3)
        {
            int year = split[0];
            int month = split[1];
            int day = split[2];

            //Range checks mirror what the DateTime constructor accepts, so it cannot throw below.
            bool validDate = year >= 1 && year <= 9999
                             && month >= 1 && month <= 12
                             && day >= 1 && day <= DateTime.DaysInMonth(year, month);

            if (validDate)
            {
                long elapsedTicks = new DateTime(year, month, day).Ticks - new DateTime(2000, 1, 1).Ticks;
                return new[] { elapsedTicks / ticksPerYear };
            }
        }

        //TODO: respond to this error (malformed date label).
    }

    return new[] { 0.0 }; //TODO: NaN? Other no information representation?
}
/*
 * private IEnumerable<TupleStruct<Kmer<Ty>, double>> ExtractUncharacteristicKmersForClass (int classIndex, MultisetKmer<Ty> thisClass, MultisetKmer<Ty> baseline)
 * {
 *  //This will only work with an enormous amount of data for low k.
 * }
 */

//Calculation:

//Synthesize features for an item.
//TODO: Enforce contract
//Returns one score per class (classCount slots). Every k-mer of the item that appears in
//learnedCharacteristicKmers adds (its count in the item) * (its learned value for the class) to each listed class.
public double[] SynthesizeFeatures(DiscreteEventSeries<Ty> item)
{
    double[] classScores = new double[classCount];
    MultisetKmer<Ty> kmerCounts = item.ToMultisetVarKmer<Ty>(k);

    foreach (KeyValuePair<Kmer<Ty>, int> observed in kmerCounts)
    {
        Dictionary<int, double> classWeights;
        if (!learnedCharacteristicKmers.TryGetValue(observed.Key, out classWeights))
        {
            //Not a characteristic k-mer of any class.
            continue;
        }

        //Console.WriteLine ("\tFound kmer " + observed.Key + ".");
        foreach (KeyValuePair<int, double> weight in classWeights)
        {
            //Console.WriteLine ("\t\tClass " + weight.Key + ", Value " + weight.Value + ", Times " + observed.Value);
            classScores[weight.Key] += observed.Value * weight.Value;
        }
    }

    return classScores;
}
//Calculation:

//Synthesize features for an item.
//TODO: Enforce contract
//Concatenates the feature vectors produced by every synthesizer in synths, in enumeration order.
public double[] SynthesizeFeatures(DiscreteEventSeries<Ty> item)
{
    var combined =
        from synth in synths
        from feature in synth.SynthesizeFeatures(item)
        select feature;

    return combined.ToArray();
}
//
//Kmer variadic k multiset
//

//Feeds the series' data array into the multiset through ConsumeEventSeriesKmer.
//The k argument is accepted but not read by this method.
public static void AddDiscreteEventSeriesVarKmer<Ty>(this MultisetKmer<Ty> multiset, DiscreteEventSeries<Ty> series, int k)
{
    multiset.ConsumeEventSeriesKmer(series.data);
}
//
//Kmer fixed k multiset
//

//Adds one Kmer of length k to the multiset for every window start position in the series' data
//(starts 0 through data.Length - k, inclusive). Nothing is added when the data is shorter than k.
public static void AddDiscreteEventSeriesKmer<Ty>(this Multiset<Kmer<Ty>> multiset, DiscreteEventSeries<Ty> series, int k)
{
    //The k-mers are built over the series' own array rather than a copy (series.data.ToArray()).
    //TODO: This is a decision.
    Ty[] events = series.data;
    int lastStart = events.Length - k;

    for (int start = 0; start <= lastStart; start++)
    {
        multiset.Add(new Kmer<Ty>(events, start, k));
    }
}
//Regression
//Scores a series by converting it to a multiset and delegating to the multiset overload of RegressEventSeries.
public double RegressEventSeries(DiscreteEventSeries<A> series)
{
    //It's probably faster to convert the series to a multiset before scoring, particularly when many things are repeated.
    //So we do.
    var counts = series.ToMultiset();

    return RegressEventSeries(counts);
}
//
//Raw multiset:
//

//Adds every element of the series' data to the given multiset, one Add call per element.
public static void AddDiscreteEventSeries<Ty>(this Multiset<Ty> multiset, DiscreteEventSeries<Ty> series)
{
    //TODO: Is it faster to make the set into a multiset, and then add the counts, so there are fewer lookups in the bigger multiset?
    foreach (Ty element in series.data)
    {
        multiset.Add(element);
    }
}
//Returns the feature schema entry at the position MaxIndex() reports for the item's synthesized feature vector.
public static string SynthesizeLabelFeature<Ty>(this IFeatureSynthesizer<Ty> synth, DiscreteEventSeries<Ty> item)
{
    var schema = synth.GetFeatureSchema();
    int winner = synth.SynthesizeFeatures(item).MaxIndex();

    return schema[winner];
}
//Synthesizes the item's features and normalizes them with NormalizeSumInPlace.
//If the first normalized value is NaN (which happens when all features are 0), every slot is replaced with
//the uniform value 1 / length. An empty feature vector is returned as is.
public static double[] SynthesizeFeaturesSumToOne<Ty>(this IFeatureSynthesizer<Ty> synth, DiscreteEventSeries<Ty> item)
{
    double[] vals = synth.SynthesizeFeatures(item).NormalizeSumInPlace();

    //Nothing to inspect or fill in; reading vals[0] below would throw on an empty vector.
    if (vals.Length == 0)
    {
        return vals;
    }

    //It can happen that all are 0, in which case NaN results.
    if (Double.IsNaN(vals[0]))
    {
        //TODO Higher order function for this!
        double uniform = 1.0 / vals.Length;
        for (int i = 0; i < vals.Length; i++)
        {
            vals[i] = uniform;
        }
    }

    return vals;
}
//Name, true class, predicted class, scores, winning score;
//Classifies one series and returns: its nameCriterion label, its criterionByWhichToClassify label, the
//classifierSchema entry with the highest score, the scores re-indexed through trueSchemaMapping, and the highest score.
public static Tuple<string, string, string, double[], double> classificationInfo(IEventSeriesProbabalisticClassifier<Ty> classifier, string[] classifierSchema, Dictionary<string, int> trueSchemaMapping, DiscreteEventSeries<Ty> data, string nameCriterion, string criterionByWhichToClassify)
{
    //scores in the synthesizer scorespace
    double[] classifierScores = classifier.Classify(data);
    int winner = classifierScores.MaxIndex();

    /*
     * classifierSchema = classifier.GetClasses();
     * if (maxIndex >= classifierSchema.Length) {
     *  Console.WriteLine ("Schema not long enough. synthlen, max, schema = " + synthScores.Length + ", " + maxIndex + ", " + classifierSchema.Length);
     *  Console.WriteLine ("Classifier Info:");
     *  Console.WriteLine (classifier.ToString ());
     *  Console.WriteLine ("Synth Features:");
     *  Console.WriteLine (((SeriesFeatureSynthesizerToVectorProbabalisticClassifierEventSeriesProbabalisticClassifier<string>)classifier).synthesizer.GetFeatureSchema().FoldToString ());
     *  Console.WriteLine ("Classifier Features:");
     *  Console.WriteLine (((SeriesFeatureSynthesizerToVectorProbabalisticClassifierEventSeriesProbabalisticClassifier<string>)classifier).classifier.GetClasses().FoldToString ());
     *  return null;
     * }
     */

    string predictedClass = classifierSchema[winner];
    double winningScore = classifierScores[winner];

    //convert scores to the true space.
    double[] trueScores = new double[trueSchemaMapping.Count];
    for (int j = 0; j < classifierSchema.Length; j++)
    {
        int trueIndex = trueSchemaMapping[classifierSchema[j]];
        trueScores[trueIndex] = classifierScores[j];
    }

    string itemName = data.labels[nameCriterion];
    string trueClass = data.labels[criterionByWhichToClassify];

    return new Tuple<string, string, string, double[], double>(itemName, trueClass, predictedClass, trueScores, winningScore);
}
//Helper, score a model on a single element
//Returns the sum-to-one normalized score the synthesizer gives to the item's true class, where the true class is the
//item's label under synth.ClassificationCriterion looked up in classRanks. Returns -1 when classRanks has no entry
//for that label. Verbosity 1 or more reports skipped items; 2 or more prints the scores of every scored item.
private static double ScoreModelSingle<Ty>(this IFeatureSynthesizer<Ty> synth, Dictionary<string, int> classRanks, DiscreteEventSeries<Ty> item, int verbosity, string nameCategory = null)
{
    string trueLabel = item.labels[synth.ClassificationCriterion];

    int correctClass;
    if (!classRanks.TryGetValue(trueLabel, out correctClass))
    {
        if (verbosity >= 1)
        {
            Console.WriteLine("Classifier does not contain data for " + trueLabel + ". Skipping this item.");
        }

        return -1;
    }

    double[] scores = synth.SynthesizeFeaturesSumToOne(item);

    if (verbosity >= 2)
    {
        //Prefix the line with the item's name when a name category was supplied.
        string heading = nameCategory != null
            ? item.labels[nameCategory] + " (" + trueLabel + ")"
            : trueLabel;

        Console.WriteLine(heading + ": " + scores.FoldToString() + " (" + scores[correctClass] + ")");
    }

    return scores[correctClass];
}
//Calculation:

//Synthesize features for an item.
//TODO: Enforce contract
//Produces one feature per regressor: the value each entry of regressors returns from RegressEventSeries for the item.
public double[] SynthesizeFeatures(DiscreteEventSeries<Ty> item)
{
    var scores = regressors.Map(regressor => regressor.RegressEventSeries(item));

    return scores.ToArray();
}
//Convenience overload: passes the series' underlying data to the consumer's ConsumeEventSeries.
public static void ConsumeEventSeries<Ty>(this EventSeriesConsumer<Ty> consumer, DiscreteEventSeries<Ty> series)
{
    var events = series.data;
    consumer.ConsumeEventSeries(events);
}
//Classifies a series in two steps: the synthesizer turns it into a feature vector, and the wrapped
//classifier's Classify result for that vector is returned.
public double[] Classify(DiscreteEventSeries<Ty> series)
{
    var features = synthesizer.SynthesizeFeatures(series);

    return classifier.Classify(features);
}