예제 #1
0
        /// <summary>
        /// Regression: scores an event series by handing its k-mer multiset to <c>regressor</c>.
        /// </summary>
        /// <param name="series">Event series to score.</param>
        /// <returns>Whatever <c>regressor.RegressEventSeries</c> returns for the k-mer multiset of <paramref name="series"/>.</returns>
        public double RegressEventSeries(DiscreteEventSeries <A> series)
        {
            // The series is collapsed into a multiset before scoring; this is probably faster
            // than scoring raw events, particularly when many k-mers repeat.
            Multiset <Kmer <A> > kmerCounts = series.ToMultisetKmer(k);
            double prediction = regressor.RegressEventSeries(kmerCounts);

            return prediction;
        }
예제 #2
0
        /// <summary>
        /// Builds a new multiset holding every element of a series' <c>data</c>.
        /// </summary>
        /// <typeparam name="Ty">Event type of the series.</typeparam>
        /// <param name="series">Series whose elements are added.</param>
        /// <returns>A freshly created multiset to which each element of <c>series.data</c> has been added, one <c>Add</c> call per element.</returns>
        public static Multiset <Ty> ToMultiset <Ty>(this DiscreteEventSeries <Ty> series)
        {
            Multiset <Ty> counts = new Multiset <Ty>();

            // Elements are added inline rather than by delegating to another helper; the original
            // author notes this is a deliberate optimization.
            foreach (Ty element in series.data)
            {
                counts.Add(element);
            }

            return counts;
        }
예제 #3
0
        /// <summary>
        /// Creates a <c>MultisetKmer</c> constructed with <paramref name="k"/> and fills it from a series
        /// via <c>AddDiscreteEventSeriesVarKmer</c>.
        /// </summary>
        /// <typeparam name="Ty">Event type of the series.</typeparam>
        /// <param name="series">Series to consume.</param>
        /// <param name="k">Value passed both to the <c>MultisetKmer</c> constructor and to the fill call.</param>
        /// <returns>The newly created, populated k-mer multiset.</returns>
        public static MultisetKmer <Ty> ToMultisetVarKmer <Ty>(this DiscreteEventSeries <Ty> series, int k)
        {
            MultisetKmer <Ty> kmerCounts = new MultisetKmer <Ty>(k);
            kmerCounts.AddDiscreteEventSeriesVarKmer(series, k);

            return kmerCounts;
        }
예제 #4
0
        /// <summary>
        /// Produces a one-line, human-readable classification report for an item.
        /// </summary>
        /// <typeparam name="Ty">Event type of the item.</typeparam>
        /// <param name="synth">Synthesizer used both for the sum-to-one scores and for the predicted label.</param>
        /// <param name="item">Item to classify.</param>
        /// <param name="nameField">Key into <c>item.labels</c> whose value is printed as the item's name.</param>
        /// <returns>A string of the form <c>name: label(max confidence)</c>, where max is the largest sum-to-one score.</returns>
        public static string ClassifyItem <Ty>(IFeatureSynthesizer <Ty> synth, DiscreteEventSeries <Ty> item, string nameField)
        {
            double[] normalizedScores = synth.SynthesizeFeaturesSumToOne(item);
            double confidence = normalizedScores.Max();

            //TODO don't report ambiguous cases.
            string itemName = item.labels[nameField];
            string predictedLabel = synth.SynthesizeLabelFeature(item);

            string verdict = itemName + ": " + predictedLabel + "";
            string confidenceNote = "(" + confidence + " confidence)";

            return verdict + confidenceNote;
        }
예제 #5
0
        /// <summary>
        /// Regression: sums the wrapped regressor's output over the k-mer multisets of a series
        /// for every k from 1 through <c>maxK</c> inclusive.
        /// </summary>
        /// <param name="series">Event series to score.</param>
        /// <returns>The accumulated score; 0 when <c>maxK</c> is less than 1.</returns>
        public double RegressEventSeries(DiscreteEventSeries <A> series)
        {
            double total = 0;
            int order = 1;

            // Each k gets its own multiset, which is scored before the next one is built
            // (the original author notes this is done for locality).
            while (order <= maxK)
            {
                Multiset <Kmer <A> > kmerCounts = series.ToMultisetKmer(order);
                total += regressor.RegressEventSeries(kmerCounts);
                order++;
            }

            return total;
        }
예제 #6
0
        /// <summary>
        /// Synthesize features for an item: five word-level statistics computed from <c>item.data</c>.
        /// </summary>
        /// <param name="item">Series of words to analyse.</param>
        /// <returns>
        /// In order: word count; words per element found in <c>stops</c>; and the fractions of words whose
        /// lower-cased form is found in <c>englishSpellingErrors</c>, <c>englishFormals</c> and <c>textSpeak</c>.
        /// The divisions are floating-point, so a zero divisor yields Infinity or NaN rather than throwing.
        /// </returns>
        public double[] SynthesizeFeatures(DiscreteEventSeries <string> item)
        {
            //"Word Count;Mean Sentence Length;Orthographical Error Rate;Formality;Textspeak"
            double wordCount = item.data.Length;

            // Stop tokens are matched as-is; the three rate features match on the lower-cased word.
            int stopCount = item.data.Count(word => stops.Contains(word));
            double meanSentenceLength = wordCount / stopCount;

            int misspelledCount = item.data.Count(word => englishSpellingErrors.Contains(word.ToLower()));
            double errorRate = misspelledCount / wordCount;

            int formalCount = item.data.Count(word => englishFormals.Contains(word.ToLower()));
            double formality = formalCount / wordCount;

            int textSpeakCount = item.data.Count(word => textSpeak.Contains(word.ToLower()));
            double textSpeakRate = textSpeakCount / wordCount;

            return new[] { wordCount, meanSentenceLength, errorRate, formality, textSpeakRate };
        }
예제 #7
0
        //TODO .NET 4.5 only.

        /*
         * public static DiscreteEventSeries<string> processEntryFromZipArchive (ZipArchive archive, Dictionary<string, string> tags, string entry, int logLevel, Func<string, string> textProcessor){
         *
         *      //Load the file as a set of words
         *      string filePath = directory + tags["filename"];
         *
         *      //Add the file as an entry to the database.
         *      using (StreamReader sr = File.OpenText(filePath)) {
         *              return loadEntry (tags, sr, logLevel, textProcessor);
         *      }
         * }
         */

        /// <summary>
        /// Reads words from a stream via <c>DatabaseLoader.loadWordFileRaw</c> and wraps them,
        /// together with the given tags, in a new <c>DiscreteEventSeries</c>.
        /// </summary>
        /// <param name="tags">Tags passed to the series constructor; also printed when logging.</param>
        /// <param name="streamReader">Reader handed to <c>DatabaseLoader.loadWordFileRaw</c>.</param>
        /// <param name="logLevel">A summary line is written to the console when this is 2 or higher.</param>
        /// <param name="textProcessor">Text transformation handed to <c>DatabaseLoader.loadWordFileRaw</c>.</param>
        /// <returns>The newly constructed series.</returns>
        public static DiscreteEventSeries <string> loadEntry(Dictionary <string, string> tags, StreamReader streamReader, int logLevel, Func <string, string> textProcessor)
        {
            string[] words = DatabaseLoader.loadWordFileRaw(streamReader, textProcessor);
            DiscreteEventSeries <string> entry = new DiscreteEventSeries <string>(tags, words);

            if (logLevel >= 2)
            {
                // Report every tag as key:value along with how many words were read.
                string tagSummary = tags.FoldToString(item => item.Key + ":" + item.Value);
                Console.WriteLine("Read " + tagSummary + ": " + words.Length + " words.");
            }

            return entry;
        }
예제 #8
0
        /// <summary>
        /// Synthesize features for an item: word count, mean and standard deviation of word length,
        /// and words per element found in <c>stops</c>.
        /// </summary>
        /// <param name="item">Series of words to analyse.</param>
        /// <returns>
        /// A four-element array: word count, mean word length, word-length standard deviation
        /// (computed by the <c>Stdev</c> helper around that mean), and mean sentence length.
        /// </returns>
        public double[] SynthesizeFeatures(DiscreteEventSeries <string> item)
        {
            string[] words = item.data;

            double wordCount = words.Length;
            double meanWordLength = words.Average(word => word.Length);
            double stdevWordLength = words.Select(word => (double)word.Length).Stdev(meanWordLength);

            //TODO: Stdev sentence length would be nice.
            // Floating-point division: no stop tokens gives Infinity (or NaN for an empty series).
            int stopCount = words.Count(word => stops.Contains(word));
            double meanSentenceLength = words.Length / (double)stopCount;

            return new[] { wordCount, meanWordLength, stdevWordLength, meanSentenceLength };
        }
예제 #9
0
        /*
         * private IEnumerable<TupleStruct<Kmer<Ty>, double>> ExtractUncharacteristicKmersForClass (int classIndex, MultisetKmer<Ty> thisClass, MultisetKmer<Ty> baseline)
         * {
         *      //This will only work with an enormous amount of data for low k.
         * }
         */


        //Calculation:

        /// <summary>
        /// Synthesize features for an item: one slot per known k-mer, holding that k-mer's count in the
        /// item divided by <c>Size</c> of the item's multiset for the k-mer's length.
        /// </summary>
        /// <param name="item">Series to featurize.</param>
        /// <returns>An array of length <c>kmerCount</c>; slots for k-mers absent from the item stay 0.</returns>
        //TODO: Enforce contract
        public double[] SynthesizeFeatures(DiscreteEventSeries <Ty> item)
        {
            double[] features = new double[kmerCount];
            MultisetKmer <Ty> kmerCounts = item.ToMultisetVarKmer <Ty>(k);

            foreach (KeyValuePair <Kmer <Ty>, int> entry in kmerCounts)
            {
                // K-mers without an assigned feature index are ignored.
                int slot;
                if (!kmersOntoIndex.TryGetValue(entry.Key, out slot))
                {
                    continue;
                }

                // Presumably Size(n) is the total count of k-mers of length n -- TODO confirm.
                double sizeForLength = (double)kmerCounts.Size((int)entry.Key.Count);
                features[slot] = entry.Value / sizeForLength;
            }

            return features;
        }
예제 #10
0
        /// <summary>
        /// Synthesize features for an item: a single feature holding the item's date label expressed
        /// as fractional years since 2000-01-01.
        /// </summary>
        /// <param name="item">
        /// Item whose label stored under <c>ClassificationCriterion</c> is read. The label is split on '-'
        /// and its first three integer parts are taken as year, month and day.
        /// </param>
        /// <returns>
        /// A one-element array. The element is 0.0 when the label is missing or null, has fewer than
        /// three parts, contains any part that is not an integer, or does not name a valid calendar date.
        /// </returns>
        public double[] SynthesizeFeatures(DiscreteEventSeries <string> item)
        {
            //Ticks are 100 ns, so this is the tick count of a 365.25-day year.  Leap years are only partially handled.
            const double ticksPerYear = 10000000.0 * 60 * 60 * 24 * 365.25;

            string date;

            if (item.labels.TryGetValue(ClassificationCriterion, out date) && date != null)
            {
                string[] terms = date.Split('-');
                int[] split = new int[terms.Length];

                //Every part must be an integer (not just the first three), matching the previous parse-everything behaviour.
                bool wellFormed = terms.Length >= 3;
                for (int i = 0; wellFormed && i < terms.Length; i++)
                {
                    wellFormed = Int32.TryParse(terms[i], out split[i]);
                }

                //Validate the calendar date up front so the DateTime constructor cannot throw,
                //instead of catching (and hiding) every exception type.
                if (wellFormed &&
                    split[0] >= 1 && split[0] <= 9999 &&
                    split[1] >= 1 && split[1] <= 12 &&
                    split[2] >= 1 && split[2] <= DateTime.DaysInMonth(split[0], split[1]))
                {
                    long ticksSince2000 = new DateTime(split[0], split[1], split[2]).Ticks - new DateTime(2000, 1, 1).Ticks;
                    return(new[] { ticksSince2000 / ticksPerYear });
                }
            }
            return(new[] { 0.0 });         //TODO: NaN?  Other no information representation?
        }
예제 #11
0
        /*
         * private IEnumerable<TupleStruct<Kmer<Ty>, double>> ExtractUncharacteristicKmersForClass (int classIndex, MultisetKmer<Ty> thisClass, MultisetKmer<Ty> baseline)
         * {
         *      //This will only work with an enormous amount of data for low k.
         * }
         */


        //Calculation:

        /// <summary>
        /// Synthesize features for an item: one score per class, accumulated from the learned
        /// characteristic k-mers that occur in the item.
        /// </summary>
        /// <param name="item">Series to featurize.</param>
        /// <returns>
        /// An array of length <c>classCount</c>. For every k-mer of the item present in
        /// <c>learnedCharacteristicKmers</c>, each associated class gains (occurrence count * learned value).
        /// </returns>
        //TODO: Enforce contract
        public double[] SynthesizeFeatures(DiscreteEventSeries <Ty> item)
        {
            double[] classScores = new double[classCount];
            MultisetKmer <Ty> kmerCounts = item.ToMultisetVarKmer <Ty>(k);

            foreach (KeyValuePair <Kmer <Ty>, int> occurrence in kmerCounts)
            {
                // K-mers that were not learned as characteristic contribute nothing.
                Dictionary <int, double> weightByClass;
                if (!learnedCharacteristicKmers.TryGetValue(occurrence.Key, out weightByClass))
                {
                    continue;
                }

                foreach (KeyValuePair <int, double> classWeight in weightByClass)
                {
                    classScores[classWeight.Key] += occurrence.Value * classWeight.Value;
                }
            }

            return classScores;
        }
예제 #12
0
        //Calculation:

        /// <summary>
        /// Synthesize features for an item by concatenating the feature vectors of every synthesizer
        /// in <c>synths</c>, in enumeration order.
        /// </summary>
        /// <param name="item">Series handed to each synthesizer.</param>
        /// <returns>A single array containing all sub-synthesizers' features back to back.</returns>
        //TODO: Enforce contract
        public double[] SynthesizeFeatures(DiscreteEventSeries <Ty> item)
        {
            var concatenated =
                from synth in synths
                from feature in synth.SynthesizeFeatures(item)
                select feature;

            return concatenated.ToArray();
        }
예제 #13
0
 //
 //Kmer variadic k multiset
 //
 /// <summary>
 /// Feeds a series' underlying <c>data</c> array to <c>ConsumeEventSeriesKmer</c> on the multiset.
 /// </summary>
 /// <typeparam name="Ty">Event type of the series.</typeparam>
 /// <param name="multiset">K-mer multiset that consumes the events.</param>
 /// <param name="series">Series whose <c>data</c> is consumed.</param>
 /// <param name="k">Not used by this method's body.</param>
 public static void AddDiscreteEventSeriesVarKmer <Ty>(this MultisetKmer <Ty> multiset, DiscreteEventSeries <Ty> series, int k)
 {
     multiset.ConsumeEventSeriesKmer(series.data);
 }
예제 #14
0
 //
 //Kmer fixed k multiset
 //
 /// <summary>
 /// Adds one <c>Kmer</c> of length <paramref name="k"/> to the multiset for every start position
 /// 0 .. <c>series.data.Length - k</c> (inclusive).
 /// </summary>
 /// <typeparam name="Ty">Event type of the series.</typeparam>
 /// <param name="multiset">Multiset receiving the k-mers.</param>
 /// <param name="series">Series whose <c>data</c> array the k-mers are built over.</param>
 /// <param name="k">Length of each k-mer; nothing is added when it exceeds the series length.</param>
 public static void AddDiscreteEventSeriesKmer <Ty>(this Multiset <Kmer <Ty> > multiset, DiscreteEventSeries <Ty> series, int k)
 {
     //TODO: This is a decision.  The series' own array is handed to each Kmer rather than a copy (series.data.ToArray()).
     Ty[] events = series.data;
     int lastStart = events.Length - k;

     for (int start = 0; start <= lastStart; start++)
     {
         multiset.Add(new Kmer <Ty>(events, start, k));
     }
 }
예제 #15
0
        //Regression

        /// <summary>
        /// Scores an event series by converting it to a multiset and delegating to the
        /// multiset overload of <c>RegressEventSeries</c>.
        /// </summary>
        /// <param name="series">Event series to score.</param>
        /// <returns>The result of the overload invoked with <c>series.ToMultiset()</c>.</returns>
        public double RegressEventSeries(DiscreteEventSeries <A> series)
        {
            // Converting first is probably faster than scoring raw events, particularly when
            // many elements repeat -- so that is what happens here.
            var counts = series.ToMultiset();

            return RegressEventSeries(counts);
        }
예제 #16
0
 //
 //Raw multiset:
 //
 /// <summary>
 /// Adds every element of a series' <c>data</c> to an existing multiset, one <c>Add</c> call per element.
 /// </summary>
 /// <typeparam name="Ty">Event type of the series.</typeparam>
 /// <param name="multiset">Multiset that receives the elements.</param>
 /// <param name="series">Series whose elements are added.</param>
 public static void AddDiscreteEventSeries <Ty>(this Multiset <Ty> multiset, DiscreteEventSeries <Ty> series)
 {
     //TODO: Is it faster to make the set into a multiset, and then add the counts, so there are fewer lookups in the bigger multiset?
     foreach (Ty element in series.data)
     {
         multiset.Add(element);
     }
 }
예제 #17
0
 /// <summary>
 /// Returns the feature-schema entry at the position of the item's largest synthesized feature.
 /// </summary>
 /// <typeparam name="Ty">Event type of the item.</typeparam>
 /// <param name="synth">Synthesizer supplying both the schema and the features.</param>
 /// <param name="item">Item to featurize.</param>
 /// <returns>The schema entry at the index reported by <c>MaxIndex</c> on the feature vector.</returns>
 public static string SynthesizeLabelFeature <Ty>(this IFeatureSynthesizer <Ty> synth, DiscreteEventSeries <Ty> item)
 {
     var schema = synth.GetFeatureSchema();
     int winningIndex = synth.SynthesizeFeatures(item).MaxIndex();

     return schema[winningIndex];
 }
예제 #18
0
 /// <summary>
 /// Synthesizes features for an item and normalizes them with <c>NormalizeSumInPlace</c>,
 /// falling back to a uniform vector when the first normalized entry is NaN.
 /// </summary>
 /// <typeparam name="Ty">Event type of the item.</typeparam>
 /// <param name="synth">Synthesizer producing the raw features.</param>
 /// <param name="item">Item to featurize.</param>
 /// <returns>The normalized array, or the same array with every entry set to 1 / length in the NaN case.</returns>
 public static double[] SynthesizeFeaturesSumToOne <Ty>(this IFeatureSynthesizer <Ty> synth, DiscreteEventSeries <Ty> item)
 {
     double[] normalized = synth.SynthesizeFeatures(item).NormalizeSumInPlace();

     //It can happen that all are 0, in which case NaN results.  Only the first entry is inspected.
     if (!Double.IsNaN(normalized[0]))
     {
         return normalized;
     }

     //TODO Higher order function for this!
     double uniform = 1.0 / normalized.Length;
     for (int slot = 0; slot < normalized.Length; slot++)
     {
         normalized[slot] = uniform;
     }

     return normalized;
 }
예제 #19
0
        /// <summary>
        /// Classifies one item and packages the outcome for reporting.
        /// </summary>
        /// <param name="classifier">Classifier whose <c>Classify</c> scores the item.</param>
        /// <param name="classifierSchema">Class name for each position of the classifier's score vector.</param>
        /// <param name="trueSchemaMapping">Maps a class name to its position in the returned score array.</param>
        /// <param name="data">Item to classify.</param>
        /// <param name="nameCriterion">Label key whose value is reported as the item's name.</param>
        /// <param name="criterionByWhichToClassify">Label key whose value is reported as the true class.</param>
        /// <returns>
        /// Name, true class, predicted class, scores, winning score. The scores are re-ordered according to
        /// <paramref name="trueSchemaMapping"/>; positions that no classifier class maps to stay 0.
        /// </returns>
        public static Tuple <string, string, string, double[], double> classificationInfo(IEventSeriesProbabalisticClassifier <Ty> classifier, string[] classifierSchema, Dictionary <string, int> trueSchemaMapping, DiscreteEventSeries <Ty> data, string nameCriterion, string criterionByWhichToClassify)
        {
            //Scores in the classifier's own score space.
            double[] classifierScores = classifier.Classify(data);
            int winner = classifierScores.MaxIndex();

            //No bounds check is made here: winner is assumed to be a valid index into classifierSchema.
            string predictedClass = classifierSchema[winner];
            double winningScore = classifierScores[winner];

            //Convert scores to the true space.
            double[] trueScores = new double[trueSchemaMapping.Count];
            int position = 0;
            foreach (string className in classifierSchema)
            {
                trueScores[trueSchemaMapping[className]] = classifierScores[position];
                position++;
            }

            string itemName = data.labels[nameCriterion];
            string trueClass = data.labels[criterionByWhichToClassify];

            return Tuple.Create(itemName, trueClass, predictedClass, trueScores, winningScore);
        }
예제 #20
0
        /// <summary>
        /// Helper: scores a model on a single element by returning the sum-to-one score the
        /// synthesizer assigns to the element's correct class.
        /// </summary>
        /// <typeparam name="Ty">Event type of the item.</typeparam>
        /// <param name="synth">Synthesizer under evaluation; its <c>ClassificationCriterion</c> selects the label to read.</param>
        /// <param name="classRanks">Maps a class label to its index in the score vector.</param>
        /// <param name="item">Element to score.</param>
        /// <param name="verbosity">1 or higher reports skipped items; 2 or higher also prints the item's scores.</param>
        /// <param name="nameCategory">Optional label key used to prefix the printed line with the item's name.</param>
        /// <returns>The score of the correct class, or -1 when the item's label is not in <paramref name="classRanks"/>.</returns>
        private static double ScoreModelSingle <Ty>(this IFeatureSynthesizer <Ty> synth, Dictionary <string, int> classRanks, DiscreteEventSeries <Ty> item, int verbosity, string nameCategory = null)
        {
            string trueLabel = item.labels[synth.ClassificationCriterion];

            int correctClass;
            bool known = classRanks.TryGetValue(trueLabel, out correctClass);

            if (!known)
            {
                if (verbosity >= 1)
                {
                    Console.WriteLine("Classifier does not contain data for " + trueLabel + ".  Skipping this item.");
                }
                return -1;
            }

            double[] scores = synth.SynthesizeFeaturesSumToOne(item);

            if (verbosity >= 2)
            {
                // Prefix with the item's name when a name category was supplied.
                string heading = (nameCategory != null)
                    ? item.labels[nameCategory] + " (" + trueLabel + ")"
                    : trueLabel;

                Console.WriteLine(heading + ": " + scores.FoldToString() + " (" + scores[correctClass] + ")");
            }

            return scores[correctClass];
        }
예제 #21
0
        //Calculation:

        /// <summary>
        /// Synthesize features for an item: one feature per entry of <c>regressors</c>, each being that
        /// regressor's <c>RegressEventSeries</c> output for the item.
        /// </summary>
        /// <param name="item">Series handed to every regressor.</param>
        /// <returns>The regressors' outputs collected into an array.</returns>
        //TODO: Enforce contract
        public double[] SynthesizeFeatures(DiscreteEventSeries <Ty> item)
        {
            var predictions = regressors.Map(regressor => regressor.RegressEventSeries(item));

            return predictions.ToArray();
        }
예제 #22
0
 /// <summary>
 /// Convenience overload: passes a series' underlying <c>data</c> to the consumer's own
 /// <c>ConsumeEventSeries</c>.
 /// </summary>
 /// <typeparam name="Ty">Event type of the series.</typeparam>
 /// <param name="consumer">Consumer that receives the events.</param>
 /// <param name="series">Series whose <c>data</c> is forwarded.</param>
 public static void ConsumeEventSeries <Ty>(this EventSeriesConsumer <Ty> consumer, DiscreteEventSeries <Ty> series)
 {
     var events = series.data;
     consumer.ConsumeEventSeries(events);
 }
예제 #23
0
 /// <summary>
 /// Classifies a series by synthesizing its features and passing them to the wrapped vector classifier.
 /// </summary>
 /// <param name="series">Series to classify.</param>
 /// <returns>The scores returned by <c>classifier.Classify</c> for the synthesized feature vector.</returns>
 public double[] Classify(DiscreteEventSeries <Ty> series)
 {
     double[] features = synthesizer.SynthesizeFeatures(series);

     return classifier.Classify(features);
 }