public static void TestDataset()
{
    // Build a tiny symptom->diagnosis training set: 3 datums over 4 feature types.
    Dataset<string, string> data = new Dataset<string, string>();
    data.Add(new BasicDatum<string, string>(Arrays.AsList(new string[] { "fever", "cough", "congestion" }), "cold"));
    data.Add(new BasicDatum<string, string>(Arrays.AsList(new string[] { "fever", "cough", "nausea" }), "flu"));
    data.Add(new BasicDatum<string, string>(Arrays.AsList(new string[] { "cough", "congestion" }), "cold"));
    // data.summaryStatistics();
    NUnit.Framework.Assert.AreEqual(4, data.NumFeatures());
    NUnit.Framework.Assert.AreEqual(4, data.NumFeatureTypes());
    NUnit.Framework.Assert.AreEqual(2, data.NumClasses());
    NUnit.Framework.Assert.AreEqual(8, data.NumFeatureTokens());
    NUnit.Framework.Assert.AreEqual(3, data.Size());
    // Dropping features seen fewer than 2 times removes "nausea" (one type, one token).
    data.ApplyFeatureCountThreshold(2);
    NUnit.Framework.Assert.AreEqual(3, data.NumFeatures());
    NUnit.Framework.Assert.AreEqual(3, data.NumFeatureTypes());
    NUnit.Framework.Assert.AreEqual(2, data.NumClasses());
    NUnit.Framework.Assert.AreEqual(7, data.NumFeatureTokens());
    NUnit.Framework.Assert.AreEqual(3, data.Size());
    //Dataset data = Dataset.readSVMLightFormat(args[0]);
    //double[] scores = data.getInformationGains();
    //System.out.println(ArrayMath.mean(scores));
    //System.out.println(ArrayMath.variance(scores));
    // Train a linear classifier and check prediction + calibrated probabilities.
    LinearClassifierFactory<string, string> factory = new LinearClassifierFactory<string, string>();
    LinearClassifier<string, string> classifier = factory.TrainClassifier(data);
    IDatum<string, string> d = new BasicDatum<string, string>(Arrays.AsList(new string[] { "cough", "fever" }));
    // BUGFIX: NUnit's Assert.AreEqual takes (expected, actual, message); the
    // JUnit-style (message, expected, actual) order used previously compared the
    // message string against the label and could never pass.
    NUnit.Framework.Assert.AreEqual("flu", classifier.ClassOf(d), "Classification incorrect");
    ICounter<string> probs = classifier.ProbabilityOf(d);
    // BUGFIX: likewise the double overload is (expected, actual, delta, message).
    NUnit.Framework.Assert.AreEqual(0.4553, probs.GetCount("cold"), 0.0001, "Returned probability incorrect");
    NUnit.Framework.Assert.AreEqual(0.5447, probs.GetCount("flu"), 0.0001, "Returned probability incorrect");
    System.Console.Out.WriteLine();
}
public static void Main(string[] args)
{
    // Training set: three working-light examples each way, plus one broken (red/red).
    IList<IDatum<string, string>> trainingData = new List<IDatum<string, string>>
    {
        MakeStopLights(Green, Red),
        MakeStopLights(Green, Red),
        MakeStopLights(Green, Red),
        MakeStopLights(Red, Green),
        MakeStopLights(Red, Green),
        MakeStopLights(Red, Green),
        MakeStopLights(Red, Red)
    };
    // Test instances: one consistent configuration and one broken one.
    IDatum<string, string> workingLights = MakeStopLights(Green, Red);
    IDatum<string, string> brokenLights = MakeStopLights(Red, Red);
    // Configure the factory: conjugate gradient ascent, per-iteration progress
    // output, and a small amount of smoothing (sigma = 10).
    LinearClassifierFactory<string, string> factory = new LinearClassifierFactory<string, string>();
    factory.UseConjugateGradientAscent();
    factory.SetVerbose(true);
    factory.SetSigma(10.0);
    // Train, dump the learned weights, and classify both test instances with
    // a per-feature justification of each decision.
    LinearClassifier<string, string> classifier = factory.TrainClassifier(trainingData);
    classifier.Dump();
    System.Console.Out.WriteLine("Working instance got: " + classifier.ClassOf(workingLights));
    classifier.JustificationOf(workingLights);
    System.Console.Out.WriteLine("Broken instance got: " + classifier.ClassOf(brokenLights));
    classifier.JustificationOf(brokenLights);
}
public virtual void TrainMulticlass(GeneralDataset<string, string> trainSet)
{
    // Dispatch on the configured classifier type: "linear" (regularized logistic
    // regression) or "svm" (SVMlight wrapper with C = sigma).
    if (Sharpen.Runtime.EqualsIgnoreCase(relationExtractorClassifierType, "linear"))
    {
        LinearClassifierFactory<string, string> lcFactory = new LinearClassifierFactory<string, string>(1e-4, false, sigma);
        lcFactory.SetVerbose(false);
        // use in-place SGD instead of QN. this is faster but much worse!
        // lcFactory.useInPlaceStochasticGradientDescent(-1, -1, 1.0);
        // use a hybrid minimizer: start with in-place SGD, continue with QN
        // lcFactory.useHybridMinimizerWithInPlaceSGD(50, -1, sigma);
        classifier = lcFactory.TrainClassifier(trainSet);
    }
    else if (Sharpen.Runtime.EqualsIgnoreCase(relationExtractorClassifierType, "svm"))
    {
        SVMLightClassifierFactory<string, string> svmFactory = new SVMLightClassifierFactory<string, string>();
        svmFactory.SetC(sigma);
        classifier = svmFactory.TrainClassifier(trainSet);
    }
    else
    {
        // Fail fast on misconfiguration. Narrowed from a bare Exception to a more
        // specific type; existing catch (Exception) handlers still catch it.
        throw new InvalidOperationException("Invalid classifier type: " + relationExtractorClassifierType);
    }
    if (logger.IsLoggable(Level.Fine))
    {
        ReportWeights(classifier, null);
    }
}
public static void Main(string[] args)
{
    // Assemble a real-valued-feature dataset of symptom intensities.
    Edu.Stanford.Nlp.Classify.RVFDataset<string, string> data = new Edu.Stanford.Nlp.Classify.RVFDataset<string, string>();
    ClassicCounter<string> firstColdSymptoms = new ClassicCounter<string>();
    firstColdSymptoms.IncrementCount("fever", 3.5);
    firstColdSymptoms.IncrementCount("cough", 1.1);
    firstColdSymptoms.IncrementCount("congestion", 4.2);
    ClassicCounter<string> fluSymptoms = new ClassicCounter<string>();
    fluSymptoms.IncrementCount("fever", 1.5);
    fluSymptoms.IncrementCount("cough", 2.1);
    fluSymptoms.IncrementCount("nausea", 3.2);
    ClassicCounter<string> secondColdSymptoms = new ClassicCounter<string>();
    secondColdSymptoms.IncrementCount("cough", 2.5);
    secondColdSymptoms.IncrementCount("congestion", 3.2);
    data.Add(new RVFDatum<string, string>(firstColdSymptoms, "cold"));
    data.Add(new RVFDatum<string, string>(fluSymptoms, "flu"));
    data.Add(new RVFDatum<string, string>(secondColdSymptoms, "cold"));
    data.SummaryStatistics();
    // Train a linear classifier using quasi-Newton minimization.
    LinearClassifierFactory<string, string> factory = new LinearClassifierFactory<string, string>();
    factory.UseQuasiNewton();
    LinearClassifier<string, string> trained = factory.TrainClassifier(data);
    // Score an unseen instance and print the per-feature justification.
    ClassicCounter<string> querySymptoms = new ClassicCounter<string>();
    querySymptoms.IncrementCount("cough", 2.3);
    querySymptoms.IncrementCount("fever", 1.3);
    RVFDatum<string, string> query = new RVFDatum<string, string>(querySymptoms);
    trained.JustificationOf((IDatum<string, string>)query);
}
/// <summary>Train a multinomial classifier off of the provided dataset.</summary>
/// <param name="dataset">The dataset to train the classifier off of.</param>
/// <param name="featureThreshold">Minimum occurrence count; features seen fewer times are dropped from the dataset.</param>
/// <param name="sigma">Regularization strength forwarded to <c>InitFactory</c>.</param>
/// <returns>A classifier.</returns>
public static IClassifier<string, string> TrainMultinomialClassifier(GeneralDataset<string, string> dataset, int featureThreshold, double sigma)
{
    // Set up the dataset and factory
    log.Info("Applying feature threshold (" + featureThreshold + ")...");
    dataset.ApplyFeatureCountThreshold(featureThreshold);
    log.Info("Randomizing dataset...");
    // Fixed seed for reproducible shuffles. Idiom fix: uppercase 'L' suffix —
    // the previous lowercase '42l' reads like 421 and triggers warning CS0078.
    dataset.Randomize(42L);
    log.Info("Creating factory...");
    LinearClassifierFactory<string, string> factory = InitFactory(sigma);
    // Train the final classifier
    log.Info("BEGIN training");
    LinearClassifier<string, string> classifier = factory.TrainClassifier(dataset);
    log.Info("END training");
    // Debug: report (optimistic) accuracy measured on the training data itself.
    KBPRelationExtractor.Accuracy trainAccuracy = new KBPRelationExtractor.Accuracy();
    foreach (IDatum<string, string> datum in dataset)
    {
        string guess = classifier.ClassOf(datum);
        trainAccuracy.Predict(Java.Util.Collections.Singleton(guess), Java.Util.Collections.Singleton(datum.Label()));
    }
    log.Info("Training accuracy:");
    log.Info(trainAccuracy.ToString());
    log.Info(string.Empty);
    // Return the classifier
    return classifier;
}
/// <summary>Builds a sigmoid model to turn the classifier outputs into probabilities.</summary>
/// <param name="classifier">The trained SVM whose raw scores are to be calibrated.</param>
/// <param name="dataset">The dataset to fit the calibration model on.</param>
/// <returns>A linear (logistic) model over the SVM scores (Platt scaling).</returns>
private LinearClassifier<L, L> FitSigmoid(SVMLightClassifier<L, F> classifier, GeneralDataset<L, F> dataset)
{
    // Re-represent each training instance by its SVM scores.
    RVFDataset<L, L> plattDataset = new RVFDataset<L, L>();
    for (int idx = 0; idx < dataset.Size(); idx++)
    {
        RVFDatum<L, F> instance = dataset.GetRVFDatum(idx);
        ICounter<L> svmScores = classifier.ScoresOf((IDatum<L, F>)instance);
        // NOTE(review): a null key here likely stands in for a bias feature that
        // was lost in the Java->C# conversion — confirm against the original source.
        svmScores.IncrementCount(null);
        plattDataset.Add(new RVFDatum<L, L>(svmScores, instance.Label()));
    }
    // Fit an unregularized logistic model over the score features.
    LinearClassifierFactory<L, L> sigmoidFactory = new LinearClassifierFactory<L, L>();
    sigmoidFactory.SetPrior(new LogPrior(LogPrior.LogPriorType.Null));
    return sigmoidFactory.TrainClassifier(plattDataset);
}
/// <summary>Create a classifier factory</summary>
/// <typeparam name="L">The label type of the classifier to be trained.</typeparam>
/// <param name="sigma">Regularization strength; only consulted by the hybrid minimizer branch.</param>
/// <returns>A factory to minimize a classifier against.</returns>
private static LinearClassifierFactory<L, string> InitFactory<L>(double sigma)
{
    LinearClassifierFactory<L, string> factory = new LinearClassifierFactory<L, string>();
    IFactory<IMinimizer<IDiffFunction>> minimizerFactory;
    switch (minimizer)
    {
        case KBPStatisticalExtractor.MinimizerType.Hybrid:
        {
            // Hybrid additionally configures in-place SGD followed by QN.
            factory.UseHybridMinimizerWithInPlaceSGD(100, 1000, sigma);
            minimizerFactory = null;
            break;
        }

        case KBPStatisticalExtractor.MinimizerType.Qn:
        case KBPStatisticalExtractor.MinimizerType.Sgd:
        case KBPStatisticalExtractor.MinimizerType.L1:
        {
            // NOTE(review): every branch assigns null here — the original
            // minimizer-factory lambdas were presumably dropped during the
            // Java->C# conversion; confirm against the upstream source.
            minimizerFactory = null;
            break;
        }

        default:
        {
            throw new InvalidOperationException("Unknown minimizer: " + minimizer);
        }
    }
    factory.SetMinimizerCreator(minimizerFactory);
    return factory;
}
public virtual void FinishTraining()
{
    // Turn the accumulated (word, tag) counts into a weighted training set,
    // fit the smoothed tag prior, and train the scorer; the raw counters are
    // released as soon as they are no longer needed.
    IntCounter<string> tagCounter = new IntCounter<string>();
    WeightedDataset data = new WeightedDataset(datumCounter.Size());
    foreach (TaggedWord taggedWord in datumCounter.KeySet())
    {
        int seen = datumCounter.GetIntCount(taggedWord);
        // Optionally train only from low-frequency words.
        if (trainOnLowCount && seen > trainCountThreshold)
        {
            continue;
        }
        // Words whose tags are handled by the function-word machinery are skipped.
        if (functionWordTags.Contains(taggedWord.Word()))
        {
            continue;
        }
        tagCounter.IncrementCount(taggedWord.Tag());
        // Training by type collapses each word's weight to 1.
        int weight = trainByType ? 1 : seen;
        data.Add(new BasicDatum(featExtractor.MakeFeatures(taggedWord.Word()), taggedWord.Tag()), weight);
    }
    datumCounter = null;
    tagDist = Distribution.LaplaceSmoothedDistribution(tagCounter, tagCounter.Size(), 0.5);
    tagCounter = null;
    ApplyThresholds(data);
    Verbose("Making classifier...");
    QNMinimizer minim = new QNMinimizer();
    //new ResultStoringMonitor(5, "weights"));
    // minim.shutUp();
    LinearClassifierFactory factory = new LinearClassifierFactory(minim);
    factory.SetTol(tol);
    factory.SetSigma(sigma);
    scorer = factory.TrainClassifier(data);
    Verbose("Done training.");
}
public ExtractQuotesClassifier(GeneralDataset<string, string> trainingSet)
{
    // Train a linear quote-to-mention model on the supplied dataset using
    // the factory's default settings.
    LinearClassifierFactory<string, string> trainer = new LinearClassifierFactory<string, string>();
    quoteToMentionClassifier = trainer.TrainClassifier(trainingSet);
}
/// <summary>Train a sentiment model from a set of data.</summary>
/// <param name="data">The data to train the model from.</param>
/// <param name="modelLocation">
/// An optional location to save the model.
/// Note that this stream will be closed in this method,
/// and should not be written to thereafter.
/// </param>
/// <returns>A sentiment classifier, ready to use.</returns>
public static SimpleSentiment Train(IStream<SimpleSentiment.SentimentDatum> data, Optional<OutputStream> modelLocation)
{
    // Some useful variables configuring how we train
    // NOTE(review): useL1 appears unused below; it was likely consumed by a
    // lambda lost in the Java->C# conversion (the null arguments) — confirm.
    bool useL1 = true;
    double sigma = 1.0;
    int featureCountThreshold = 5;
    // Featurize the data
    Redwood.Util.ForceTrack("Featurizing");
    RVFDataset<SentimentClass, string> dataset = new RVFDataset<SentimentClass, string>();
    AtomicInteger datasize = new AtomicInteger(0);
    ICounter<SentimentClass> distribution = new ClassicCounter<SentimentClass>();
    data.Unordered().Parallel().Map(null).ForEach(null);
    Redwood.Util.EndTrack("Featurizing");
    // Print label distribution
    Redwood.Util.StartTrack("Distribution");
    foreach (SentimentClass label in SentimentClass.Values())
    {
        Redwood.Util.Log(string.Format("%7d", (int)distribution.GetCount(label)) + " " + label);
    }
    Redwood.Util.EndTrack("Distribution");
    // Train the classifier
    Redwood.Util.ForceTrack("Training");
    if (featureCountThreshold > 1)
    {
        dataset.ApplyFeatureCountThreshold(featureCountThreshold);
    }
    dataset.Randomize(42L);
    LinearClassifierFactory<SentimentClass, string> factory = new LinearClassifierFactory<SentimentClass, string>();
    factory.SetVerbose(true);
    try
    {
        factory.SetMinimizerCreator(null);
    }
    catch (Exception)
    {
    }
    factory.SetSigma(sigma);
    LinearClassifier<SentimentClass, string> classifier = factory.TrainClassifier(dataset);
    // Optionally save the model
    modelLocation.IfPresent(null);
    Redwood.Util.EndTrack("Training");
    // Evaluate the model with k-fold cross-validation
    Redwood.Util.ForceTrack("Evaluating");
    factory.SetVerbose(false);
    double sumAccuracy = 0.0;
    ICounter<SentimentClass> sumP = new ClassicCounter<SentimentClass>();
    ICounter<SentimentClass> sumR = new ClassicCounter<SentimentClass>();
    int numFolds = 4;
    for (int fold = 0; fold < numFolds; ++fold)
    {
        Pair<GeneralDataset<SentimentClass, string>, GeneralDataset<SentimentClass, string>> trainTest = dataset.SplitOutFold(fold, numFolds);
        LinearClassifier<SentimentClass, string> foldClassifier = factory.TrainClassifierWithInitialWeights(trainTest.first, classifier);
        // convex objective, so this should be OK
        sumAccuracy += foldClassifier.EvaluateAccuracy(trainTest.second);
        foreach (SentimentClass label_1 in SentimentClass.Values())
        {
            Pair<double, double> pr = foldClassifier.EvaluatePrecisionAndRecall(trainTest.second, label_1);
            sumP.IncrementCount(label_1, pr.first);
            // BUGFIX: recall was previously accumulated into sumP, leaving sumR
            // permanently zero and reporting recall/F1 as 0% for every label.
            sumR.IncrementCount(label_1, pr.second);
        }
    }
    DecimalFormat df = new DecimalFormat("0.000%");
    log.Info("----------");
    double aveAccuracy = sumAccuracy / ((double)numFolds);
    log.Info(string.Empty + numFolds + "-fold accuracy: " + df.Format(aveAccuracy));
    log.Info(string.Empty);
    foreach (SentimentClass label_2 in SentimentClass.Values())
    {
        double p = sumP.GetCount(label_2) / numFolds;
        double r = sumR.GetCount(label_2) / numFolds;
        log.Info(label_2 + " (P) = " + df.Format(p));
        log.Info(label_2 + " (R) = " + df.Format(r));
        // NOTE(review): p + r can be 0 for a never-predicted label, making F1 NaN — confirm acceptable.
        log.Info(label_2 + " (F1) = " + df.Format(2 * p * r / (p + r)));
        log.Info(string.Empty);
    }
    log.Info("----------");
    Redwood.Util.EndTrack("Evaluating");
    // Return
    return new SimpleSentiment(classifier);
}