/// <summary>
/// Builds a training dataset from every relation mention found in the given corpus.
/// </summary>
/// <param name="corpus">Annotated corpus whose sentences supply the relation mentions.</param>
/// <returns>
/// An <c>RVFDataset</c> holding one datum per relation mention, with rare features
/// removed via the configured <c>featureCountThreshold</c>.
/// </returns>
protected internal virtual GeneralDataset<string, string> CreateDataset(Annotation corpus)
{
    GeneralDataset<string, string> result = new RVFDataset<string, string>();
    // Walk every sentence, then every relation mention inside it
    // (including synthesized "unrelated" relations when configured).
    foreach (ICoreMap sent in corpus.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        foreach (RelationMention mention in AnnotationUtils.GetAllRelations(relationMentionFactory, sent, createUnrelatedRelations))
        {
            result.Add(CreateDatum(mention));
        }
    }
    // Drop features that occur fewer than featureCountThreshold times.
    result.ApplyFeatureCountThreshold(featureCountThreshold);
    return result;
}
/// <summary>Train a sentiment model from a set of data.</summary>
/// <param name="data">The data to train the model from.</param>
/// <param name="modelLocation">
/// An optional location to save the model.
/// Note that this stream will be closed in this method,
/// and should not be written to thereafter.
/// </param>
/// <returns>A sentiment classifier, ready to use.</returns>
public static SimpleSentiment Train(IStream<SimpleSentiment.SentimentDatum> data, Optional<OutputStream> modelLocation)
{
    // Some useful variables configuring how we train
    bool useL1 = true;  // NOTE(review): unused here; presumably consumed by the minimizer lambda lost in conversion — confirm
    double sigma = 1.0;
    int featureCountThreshold = 5;
    // Featurize the data
    Redwood.Util.ForceTrack("Featurizing");
    RVFDataset<SentimentClass, string> dataset = new RVFDataset<SentimentClass, string>();
    AtomicInteger datasize = new AtomicInteger(0);
    ICounter<SentimentClass> distribution = new ClassicCounter<SentimentClass>();
    // NOTE(review): the null arguments below appear to be lambdas dropped by an
    // automated Java->C# conversion; as written this pipeline featurizes nothing.
    data.Unordered().Parallel().Map(null).ForEach(null);
    Redwood.Util.EndTrack("Featurizing");
    // Print label distribution
    Redwood.Util.StartTrack("Distribution");
    foreach (SentimentClass label in SentimentClass.Values())
    {
        Redwood.Util.Log(string.Format("%7d", (int)distribution.GetCount(label)) + " " + label);
    }
    Redwood.Util.EndTrack("Distribution");
    // Train the classifier
    Redwood.Util.ForceTrack("Training");
    if (featureCountThreshold > 1)
    {
        dataset.ApplyFeatureCountThreshold(featureCountThreshold);
    }
    dataset.Randomize(42L);
    LinearClassifierFactory<SentimentClass, string> factory = new LinearClassifierFactory<SentimentClass, string>();
    factory.SetVerbose(true);
    try
    {
        // Best-effort: keep the default minimizer if this fails (null is a conversion artifact).
        factory.SetMinimizerCreator(null);
    }
    catch (Exception)
    {
    }
    factory.SetSigma(sigma);
    LinearClassifier<SentimentClass, string> classifier = factory.TrainClassifier(dataset);
    // Optionally save the model
    modelLocation.IfPresent(null);
    Redwood.Util.EndTrack("Training");
    // Evaluate the model with k-fold cross-validation
    Redwood.Util.ForceTrack("Evaluating");
    factory.SetVerbose(false);
    double sumAccuracy = 0.0;
    ICounter<SentimentClass> sumP = new ClassicCounter<SentimentClass>();
    ICounter<SentimentClass> sumR = new ClassicCounter<SentimentClass>();
    int numFolds = 4;
    for (int fold = 0; fold < numFolds; ++fold)
    {
        Pair<GeneralDataset<SentimentClass, string>, GeneralDataset<SentimentClass, string>> trainTest = dataset.SplitOutFold(fold, numFolds);
        LinearClassifier<SentimentClass, string> foldClassifier = factory.TrainClassifierWithInitialWeights(trainTest.first, classifier);
        // convex objective, so this should be OK
        sumAccuracy += foldClassifier.EvaluateAccuracy(trainTest.second);
        foreach (SentimentClass label_1 in SentimentClass.Values())
        {
            Pair<double, double> pr = foldClassifier.EvaluatePrecisionAndRecall(trainTest.second, label_1);
            sumP.IncrementCount(label_1, pr.first);
            // BUG FIX: recall was previously accumulated into sumP, leaving sumR
            // always zero (reported recall/F1 were wrong and precision inflated).
            sumR.IncrementCount(label_1, pr.second);
        }
    }
    DecimalFormat df = new DecimalFormat("0.000%");
    log.Info("----------");
    double aveAccuracy = sumAccuracy / ((double)numFolds);
    log.Info(string.Empty + numFolds + "-fold accuracy: " + df.Format(aveAccuracy));
    log.Info(string.Empty);
    foreach (SentimentClass label_2 in SentimentClass.Values())
    {
        double p = sumP.GetCount(label_2) / numFolds;
        double r = sumR.GetCount(label_2) / numFolds;
        // Guard against 0/0 when a label has zero precision and recall.
        double f1 = (p + r) > 0.0 ? 2 * p * r / (p + r) : 0.0;
        log.Info(label_2 + " (P) = " + df.Format(p));
        log.Info(label_2 + " (R) = " + df.Format(r));
        log.Info(label_2 + " (F1) = " + df.Format(f1));
        log.Info(string.Empty);
    }
    log.Info("----------");
    Redwood.Util.EndTrack("Evaluating");
    // Return
    return new SimpleSentiment(classifier);
}