示例#1
0
        /// <summary>
        /// Builds a dataset containing one datum for every relation mention found in the
        /// sentences of <paramref name="corpus"/>.
        /// </summary>
        /// <param name="corpus">Annotation whose <c>SentencesAnnotation</c> value is enumerated.</param>
        /// <returns>The populated dataset, after the feature count threshold has been applied.</returns>
        protected internal virtual GeneralDataset <string, string> CreateDataset(Annotation corpus)
        {
            GeneralDataset <string, string> result = new RVFDataset <string, string>();
            var sentences = corpus.Get(typeof(CoreAnnotations.SentencesAnnotation));

            foreach (ICoreMap currentSentence in sentences)
            {
                // Collect the relation mentions of this sentence, then turn each one into a datum.
                var mentions = AnnotationUtils.GetAllRelations(relationMentionFactory, currentSentence, createUnrelatedRelations);
                foreach (RelationMention mention in mentions)
                {
                    var datum = CreateDatum(mention);
                    result.Add(datum);
                }
            }

            // Apply the configured feature count threshold before handing the dataset back.
            result.ApplyFeatureCountThreshold(featureCountThreshold);
            return result;
        }
示例#2
0
        /// <summary>Train a sentiment model from a set of data.</summary>
        /// <remarks>
        /// After training on the full dataset, the method runs a 4-fold evaluation (each fold
        /// is trained starting from the full-data classifier's weights) and logs the average
        /// accuracy together with per-label precision, recall and F1.
        /// NOTE(review): the callbacks passed to <c>Map</c>, <c>ForEach</c>,
        /// <c>SetMinimizerCreator</c> and <c>IfPresent</c> are <c>null</c> in this version,
        /// presumably lost in an automated port. Until they are restored, nothing visible here
        /// adds data to the dataset, configures the minimizer, or writes the model.
        /// </remarks>
        /// <param name="data">The data to train the model from.</param>
        /// <param name="modelLocation">
        /// An optional location to save the model.
        /// Note that this stream will be closed in this method,
        /// and should not be written to thereafter.
        /// </param>
        /// <returns>A sentiment classifier, ready to use.</returns>
        public static SimpleSentiment Train(IStream <SimpleSentiment.SentimentDatum> data, Optional <OutputStream> modelLocation)
        {
            // Some useful variables configuring how we train
            // NOTE(review): useL1 and datasize are not read below; presumably they were consumed
            // by the callbacks that are currently null. Kept so those callbacks can be restored.
            bool   useL1 = true;
            double sigma = 1.0;
            int    featureCountThreshold = 5;

            // Featurize the data
            Redwood.Util.ForceTrack("Featurizing");
            RVFDataset <SentimentClass, string> dataset = new RVFDataset <SentimentClass, string>();
            AtomicInteger             datasize          = new AtomicInteger(0);
            ICounter <SentimentClass> distribution      = new ClassicCounter <SentimentClass>();

            // NOTE(review): both callbacks are null placeholders; the featurizer and the code that
            // adds each datum to `dataset` / `distribution` need to be supplied here.
            data.Unordered().Parallel().Map(null).ForEach(null);
            Redwood.Util.EndTrack("Featurizing");
            // Print label distribution
            Redwood.Util.StartTrack("Distribution");
            foreach (SentimentClass label in SentimentClass.Values())
            {
                // "{0,7}" right-aligns the count in a 7-character column. The previous "%7d" is a
                // Java format specifier that string.Format emits literally, dropping the count.
                Redwood.Util.Log(string.Format("{0,7}", (int)distribution.GetCount(label)) + "   " + label);
            }
            Redwood.Util.EndTrack("Distribution");
            // Train the classifier
            Redwood.Util.ForceTrack("Training");
            if (featureCountThreshold > 1)
            {
                dataset.ApplyFeatureCountThreshold(featureCountThreshold);
            }
            // Fixed seed so the shuffle (and therefore the fold split below) is repeatable.
            dataset.Randomize(42L);
            LinearClassifierFactory <SentimentClass, string> factory = new LinearClassifierFactory <SentimentClass, string>();

            factory.SetVerbose(true);
            try
            {
                // NOTE(review): null placeholder for the minimizer creator callback.
                factory.SetMinimizerCreator(null);
            }
            catch (Exception e)
            {
                // Best effort: training continues without a custom minimizer, but the failure
                // is reported instead of being silently discarded.
                log.Info("Could not set the minimizer creator; continuing without it: " + e);
            }
            factory.SetSigma(sigma);
            LinearClassifier <SentimentClass, string> classifier = factory.TrainClassifier(dataset);

            // Optionally save the model
            // NOTE(review): null placeholder for the callback that serializes the classifier.
            modelLocation.IfPresent(null);
            Redwood.Util.EndTrack("Training");
            // Evaluate the model
            Redwood.Util.ForceTrack("Evaluating");
            factory.SetVerbose(false);
            double sumAccuracy             = 0.0;
            ICounter <SentimentClass> sumP = new ClassicCounter <SentimentClass>();
            ICounter <SentimentClass> sumR = new ClassicCounter <SentimentClass>();
            int numFolds = 4;

            for (int fold = 0; fold < numFolds; ++fold)
            {
                Pair <GeneralDataset <SentimentClass, string>, GeneralDataset <SentimentClass, string> > trainTest = dataset.SplitOutFold(fold, numFolds);
                LinearClassifier <SentimentClass, string> foldClassifier = factory.TrainClassifierWithInitialWeights(trainTest.first, classifier);
                // convex objective, so this should be OK
                sumAccuracy += foldClassifier.EvaluateAccuracy(trainTest.second);
                foreach (SentimentClass label_1 in SentimentClass.Values())
                {
                    Pair <double, double> pr = foldClassifier.EvaluatePrecisionAndRecall(trainTest.second, label_1);
                    // Precision goes into sumP and recall into sumR. Previously both were added to
                    // sumP, leaving sumR empty and inflating the reported precision.
                    sumP.IncrementCount(label_1, pr.first);
                    sumR.IncrementCount(label_1, pr.second);
                }
            }
            DecimalFormat df = new DecimalFormat("0.000%");

            log.Info("----------");
            double aveAccuracy = sumAccuracy / ((double)numFolds);

            log.Info(string.Empty + numFolds + "-fold accuracy: " + df.Format(aveAccuracy));
            log.Info(string.Empty);
            foreach (SentimentClass label_2 in SentimentClass.Values())
            {
                // Average the per-fold sums.
                double p = sumP.GetCount(label_2) / numFolds;
                double r = sumR.GetCount(label_2) / numFolds;
                // Report F1 as 0 when precision and recall are both exactly 0 rather than
                // computing 0/0; any other value (including NaN) flows through unchanged.
                double f1 = (p + r) == 0.0 ? 0.0 : 2 * p * r / (p + r);
                log.Info(label_2 + " (P)  = " + df.Format(p));
                log.Info(label_2 + " (R)  = " + df.Format(r));
                log.Info(label_2 + " (F1) = " + df.Format(f1));
                log.Info(string.Empty);
            }
            log.Info("----------");
            Redwood.Util.EndTrack("Evaluating");
            // Return
            return(new SimpleSentiment(classifier));
        }