Example n. 1
        public static void TestDataset()
        {
            Dataset <string, string> data = new Dataset <string, string>();

            data.Add(new BasicDatum <string, string>(Arrays.AsList(new string[] { "fever", "cough", "congestion" }), "cold"));
            data.Add(new BasicDatum <string, string>(Arrays.AsList(new string[] { "fever", "cough", "nausea" }), "flu"));
            data.Add(new BasicDatum <string, string>(Arrays.AsList(new string[] { "cough", "congestion" }), "cold"));
            // data.summaryStatistics();
            NUnit.Framework.Assert.AreEqual(4, data.NumFeatures());
            NUnit.Framework.Assert.AreEqual(4, data.NumFeatureTypes());
            NUnit.Framework.Assert.AreEqual(2, data.NumClasses());
            NUnit.Framework.Assert.AreEqual(8, data.NumFeatureTokens());
            NUnit.Framework.Assert.AreEqual(3, data.Size());
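            // Feature counts across the three datums: cough=3, fever=2, congestion=2, nausea=1.
            // A threshold of 2 therefore drops only "nausea": 4 -> 3 feature types, 8 -> 7 tokens.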
            data.ApplyFeatureCountThreshold(2);
            NUnit.Framework.Assert.AreEqual(3, data.NumFeatures());
            NUnit.Framework.Assert.AreEqual(3, data.NumFeatureTypes());
            NUnit.Framework.Assert.AreEqual(2, data.NumClasses());
            NUnit.Framework.Assert.AreEqual(7, data.NumFeatureTokens());
            NUnit.Framework.Assert.AreEqual(3, data.Size());
            //Dataset data = Dataset.readSVMLightFormat(args[0]);
            //double[] scores = data.getInformationGains();
            //System.out.println(ArrayMath.mean(scores));
            //System.out.println(ArrayMath.variance(scores));
            LinearClassifierFactory <string, string> factory    = new LinearClassifierFactory <string, string>();
            LinearClassifier <string, string>        classifier = factory.TrainClassifier(data);
            IDatum <string, string> d = new BasicDatum <string, string>(Arrays.AsList(new string[] { "cough", "fever" }));

            NUnit.Framework.Assert.AreEqual("Classification incorrect", "flu", classifier.ClassOf(d));
            ICounter <string> probs = classifier.ProbabilityOf(d);

            NUnit.Framework.Assert.AreEqual("Returned probability incorrect", 0.4553, probs.GetCount("cold"), 0.0001);
            NUnit.Framework.Assert.AreEqual("Returned probability incorrect", 0.5447, probs.GetCount("flu"), 0.0001);
            System.Console.Out.WriteLine();
        }
Example n. 2
        public static void Main(string[] args)
        {
            // Create a training set
            IList <IDatum <string, string> > trainingData = new List <IDatum <string, string> >();

            trainingData.Add(MakeStopLights(Green, Red));
            trainingData.Add(MakeStopLights(Green, Red));
            trainingData.Add(MakeStopLights(Green, Red));
            trainingData.Add(MakeStopLights(Red, Green));
            trainingData.Add(MakeStopLights(Red, Green));
            trainingData.Add(MakeStopLights(Red, Green));
            trainingData.Add(MakeStopLights(Red, Red));
            // Create a test set
            IDatum <string, string> workingLights = MakeStopLights(Green, Red);
            IDatum <string, string> brokenLights  = MakeStopLights(Red, Red);
            // Build a classifier factory
            LinearClassifierFactory <string, string> factory = new LinearClassifierFactory <string, string>();

            factory.UseConjugateGradientAscent();
            // Turn on per-iteration convergence updates
            factory.SetVerbose(true);
            //Small amount of smoothing
            factory.SetSigma(10.0);
            // Build a classifier
            LinearClassifier <string, string> classifier = factory.TrainClassifier(trainingData);

            // Check out the learned weights
            classifier.Dump();
            // Test the classifier
            System.Console.Out.WriteLine("Working instance got: " + classifier.ClassOf(workingLights));
            classifier.JustificationOf(workingLights);
            System.Console.Out.WriteLine("Broken instance got: " + classifier.ClassOf(brokenLights));
            classifier.JustificationOf(brokenLights);
        }
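
        // Note: MakeStopLights is not defined in this snippet. Below is a minimal sketch of
        // what it could look like, modeled on Stanford's ClassifierDemo; the Green/Red and
        // Working/Broken constant values are assumptions, not part of the original.
        private const string Green = "green";
        private const string Red = "red";
        private const string Working = "working";
        private const string Broken = "broken";

        private static IDatum <string, string> MakeStopLights(string ns, string ew)
        {
            IList <string> features = new List <string>();
            // Encode each light's color as a categorical feature
            features.Add("NS=" + ns);
            features.Add("EW=" + ew);
            // Both lights showing the same color means the signal is broken
            string label = (ns == ew) ? Broken : Working;
            return new BasicDatum <string, string>(features, label);
        }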
Example n. 3
 public virtual void TrainMulticlass(GeneralDataset <string, string> trainSet)
 {
     if (Sharpen.Runtime.EqualsIgnoreCase(relationExtractorClassifierType, "linear"))
     {
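         // constructor arguments are presumably tolerance (1e-4), useSum (false), and sigma (the regularization strength)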
         LinearClassifierFactory <string, string> lcFactory = new LinearClassifierFactory <string, string>(1e-4, false, sigma);
         lcFactory.SetVerbose(false);
         // use in-place SGD instead of QN. this is faster but much worse!
         // lcFactory.useInPlaceStochasticGradientDescent(-1, -1, 1.0);
         // use a hybrid minimizer: start with in-place SGD, continue with QN
         // lcFactory.useHybridMinimizerWithInPlaceSGD(50, -1, sigma);
         classifier = lcFactory.TrainClassifier(trainSet);
     }
     else if (Sharpen.Runtime.EqualsIgnoreCase(relationExtractorClassifierType, "svm"))
     {
         SVMLightClassifierFactory <string, string> svmFactory = new SVMLightClassifierFactory <string, string>();
         svmFactory.SetC(sigma);
         classifier = svmFactory.TrainClassifier(trainSet);
     }
     else
     {
         throw new Exception("Invalid classifier type: " + relationExtractorClassifierType);
     }
     if (logger.IsLoggable(Level.Fine))
     {
         ReportWeights(classifier, null);
     }
 }
Example n. 4
        public static void Main(string[] args)
        {
            Edu.Stanford.Nlp.Classify.RVFDataset <string, string> data = new Edu.Stanford.Nlp.Classify.RVFDataset <string, string>();
            ClassicCounter <string> c1 = new ClassicCounter <string>();

            c1.IncrementCount("fever", 3.5);
            c1.IncrementCount("cough", 1.1);
            c1.IncrementCount("congestion", 4.2);
            ClassicCounter <string> c2 = new ClassicCounter <string>();

            c2.IncrementCount("fever", 1.5);
            c2.IncrementCount("cough", 2.1);
            c2.IncrementCount("nausea", 3.2);
            ClassicCounter <string> c3 = new ClassicCounter <string>();

            c3.IncrementCount("cough", 2.5);
            c3.IncrementCount("congestion", 3.2);
            data.Add(new RVFDatum <string, string>(c1, "cold"));
            data.Add(new RVFDatum <string, string>(c2, "flu"));
            data.Add(new RVFDatum <string, string>(c3, "cold"));
            data.SummaryStatistics();
            LinearClassifierFactory <string, string> factory = new LinearClassifierFactory <string, string>();

            factory.UseQuasiNewton();
            LinearClassifier <string, string> c  = factory.TrainClassifier(data);
            ClassicCounter <string>           c4 = new ClassicCounter <string>();

            c4.IncrementCount("cough", 2.3);
            c4.IncrementCount("fever", 1.3);
            RVFDatum <string, string> datum = new RVFDatum <string, string>(c4);

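            // Print a feature-by-feature justification of the predicted label for this datum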
            c.JustificationOf((IDatum <string, string>)datum);
        }
Example n. 5
        /// <summary>Train a multinomial classifier off of the provided dataset.</summary>
        /// <param name="dataset">The dataset to train the classifier off of.</param>
        /// <returns>A classifier.</returns>
        public static IClassifier <string, string> TrainMultinomialClassifier(GeneralDataset <string, string> dataset, int featureThreshold, double sigma)
        {
            // Set up the dataset and factory
            log.Info("Applying feature threshold (" + featureThreshold + ")...");
            dataset.ApplyFeatureCountThreshold(featureThreshold);
            log.Info("Randomizing dataset...");
            dataset.Randomize(42L);
            log.Info("Creating factory...");
            LinearClassifierFactory <string, string> factory = InitFactory(sigma);

            // Train the final classifier
            log.Info("BEGIN training");
            LinearClassifier <string, string> classifier = factory.TrainClassifier(dataset);

            log.Info("END training");
            // Debug
            KBPRelationExtractor.Accuracy trainAccuracy = new KBPRelationExtractor.Accuracy();
            foreach (IDatum <string, string> datum in dataset)
            {
                string guess = classifier.ClassOf(datum);
                trainAccuracy.Predict(Java.Util.Collections.Singleton(guess), Java.Util.Collections.Singleton(datum.Label()));
            }
            log.Info("Training accuracy:");
            log.Info(trainAccuracy.ToString());
            log.Info(string.Empty);
            // Return the classifier
            return(classifier);
        }
Example n. 6
        /// <summary>Builds a sigmoid model to turn the classifier outputs into probabilities.</summary>
        private LinearClassifier <L, L> FitSigmoid(SVMLightClassifier <L, F> classifier, GeneralDataset <L, F> dataset)
        {
            RVFDataset <L, L> plattDataset = new RVFDataset <L, L>();

            for (int i = 0; i < dataset.Size(); i++)
            {
                RVFDatum <L, F> d      = dataset.GetRVFDatum(i);
                ICounter <L>    scores = classifier.ScoresOf((IDatum <L, F>)d);
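                // the null key presumably acts as a constant bias feature for the sigmoid fit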
                scores.IncrementCount(null);
                plattDataset.Add(new RVFDatum <L, L>(scores, d.Label()));
            }
            LinearClassifierFactory <L, L> factory = new LinearClassifierFactory <L, L>();

            factory.SetPrior(new LogPrior(LogPrior.LogPriorType.Null));
            return(factory.TrainClassifier(plattDataset));
        }
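
        // A minimal sketch (a hypothetical helper, not part of the original class) of how the
        // fitted sigmoid could be applied: wrap a datum's raw SVM scores, plus the same
        // null-keyed bias feature used above, in an RVFDatum and ask the Platt model for a
        // probability distribution over labels.
        private ICounter <L> PlattProbabilityOf(LinearClassifier <L, L> platt, SVMLightClassifier <L, F> classifier, IDatum <L, F> d)
        {
            ICounter <L> scores = classifier.ScoresOf(d);
            scores.IncrementCount(null);
            return platt.ProbabilityOf((IDatum <L, L>)new RVFDatum <L, L>(scores));
        }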
Example n. 7
        /// <summary>Create a classifier factory</summary>
        /// <typeparam name="L">The label type of the classifier.</typeparam>
        /// <returns>A factory to minimize a classifier against.</returns>
        private static LinearClassifierFactory <L, string> InitFactory <L>(double sigma)
        {
            LinearClassifierFactory <L, string>    factory = new LinearClassifierFactory <L, string>();
            IFactory <IMinimizer <IDiffFunction> > minimizerFactory;
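            // The null assignments in the switch below appear to be placeholders where the
            // original Java supplied minimizer-creating lambdas that did not survive conversion.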

            switch (minimizer)
            {
            case KBPStatisticalExtractor.MinimizerType.Qn:
            {
                minimizerFactory = null;
                break;
            }

            case KBPStatisticalExtractor.MinimizerType.Sgd:
            {
                minimizerFactory = null;
                break;
            }

            case KBPStatisticalExtractor.MinimizerType.Hybrid:
            {
                factory.UseHybridMinimizerWithInPlaceSGD(100, 1000, sigma);
                minimizerFactory = null;
                break;
            }

            case KBPStatisticalExtractor.MinimizerType.L1:
            {
                minimizerFactory = null;
                break;
            }

            default:
            {
                throw new InvalidOperationException("Unknown minimizer: " + minimizer);
            }
            }
            factory.SetMinimizerCreator(minimizerFactory);
            return(factory);
        }
Example n. 8
        public virtual void FinishTraining()
        {
            IntCounter <string> tagCounter = new IntCounter <string>();
            WeightedDataset     data       = new WeightedDataset(datumCounter.Size());

            foreach (TaggedWord word in datumCounter.KeySet())
            {
                int count = datumCounter.GetIntCount(word);
                if (trainOnLowCount && count > trainCountThreshold)
                {
                    continue;
                }
                if (functionWordTags.Contains(word.Word()))
                {
                    continue;
                }
                tagCounter.IncrementCount(word.Tag());
                if (trainByType)
                {
                    count = 1;
                }
                data.Add(new BasicDatum(featExtractor.MakeFeatures(word.Word()), word.Tag()), count);
            }
            datumCounter = null;
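            // Laplace-smooth the tag distribution with a pseudo-count of 0.5 per tag type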
            tagDist      = Distribution.LaplaceSmoothedDistribution(tagCounter, tagCounter.Size(), 0.5);
            tagCounter   = null;
            ApplyThresholds(data);
            Verbose("Making classifier...");
            QNMinimizer minim = new QNMinimizer();
            //new ResultStoringMonitor(5, "weights"));
            //    minim.shutUp();
            LinearClassifierFactory factory = new LinearClassifierFactory(minim);

            factory.SetTol(tol);
            factory.SetSigma(sigma);
            scorer = factory.TrainClassifier(data);
            Verbose("Done training.");
        }
Example n. 9
        public ExtractQuotesClassifier(GeneralDataset <string, string> trainingSet)
        {
            LinearClassifierFactory <string, string> lcf = new LinearClassifierFactory <string, string>();

            quoteToMentionClassifier = lcf.TrainClassifier(trainingSet);
        }
Example n. 10
        /// <summary>Train a sentiment model from a set of data.</summary>
        /// <param name="data">The data to train the model from.</param>
        /// <param name="modelLocation">
        /// An optional location to save the model.
        /// Note that this stream will be closed in this method,
        /// and should not be written to thereafter.
        /// </param>
        /// <returns>A sentiment classifier, ready to use.</returns>
        public static SimpleSentiment Train(IStream <SimpleSentiment.SentimentDatum> data, Optional <OutputStream> modelLocation)
        {
            // Some useful variables configuring how we train
            bool   useL1 = true;
            double sigma = 1.0;
            int    featureCountThreshold = 5;

            // Featurize the data
            Redwood.Util.ForceTrack("Featurizing");
            RVFDataset <SentimentClass, string> dataset = new RVFDataset <SentimentClass, string>();
            AtomicInteger             datasize          = new AtomicInteger(0);
            ICounter <SentimentClass> distribution      = new ClassicCounter <SentimentClass>();

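            // The featurizing lambdas appear to have been lost in the Java-to-C# conversion:
            // Map and ForEach receive null placeholders, so this line is unlikely to run as-is.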
            data.Unordered().Parallel().Map(null).ForEach(null);
            Redwood.Util.EndTrack("Featurizing");
            // Print label distribution
            Redwood.Util.StartTrack("Distribution");
            foreach (SentimentClass label in SentimentClass.Values())
            {
                Redwood.Util.Log(string.Format("%7d", (int)distribution.GetCount(label)) + "   " + label);
            }
            Redwood.Util.EndTrack("Distribution");
            // Train the classifier
            Redwood.Util.ForceTrack("Training");
            if (featureCountThreshold > 1)
            {
                dataset.ApplyFeatureCountThreshold(featureCountThreshold);
            }
            dataset.Randomize(42L);
            LinearClassifierFactory <SentimentClass, string> factory = new LinearClassifierFactory <SentimentClass, string>();

            factory.SetVerbose(true);
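            // SetMinimizerCreator originally took a lambda; the null below looks like a
            // conversion placeholder, so the call may throw and the empty catch falls back
            // to the factory's default minimizer.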
            try
            {
                factory.SetMinimizerCreator(null);
            }
            catch (Exception)
            {
            }
            factory.SetSigma(sigma);
            LinearClassifier <SentimentClass, string> classifier = factory.TrainClassifier(dataset);

            // Optionally save the model
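            // (the saving lambda was likewise lost in conversion; null is a placeholder)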
            modelLocation.IfPresent(null);
            Redwood.Util.EndTrack("Training");
            // Evaluate the model
            Redwood.Util.ForceTrack("Evaluating");
            factory.SetVerbose(false);
            double sumAccuracy             = 0.0;
            ICounter <SentimentClass> sumP = new ClassicCounter <SentimentClass>();
            ICounter <SentimentClass> sumR = new ClassicCounter <SentimentClass>();
            int numFolds = 4;

            for (int fold = 0; fold < numFolds; ++fold)
            {
                Pair <GeneralDataset <SentimentClass, string>, GeneralDataset <SentimentClass, string> > trainTest = dataset.SplitOutFold(fold, numFolds);
                LinearClassifier <SentimentClass, string> foldClassifier = factory.TrainClassifierWithInitialWeights(trainTest.first, classifier);
                // convex objective, so this should be OK
                sumAccuracy += foldClassifier.EvaluateAccuracy(trainTest.second);
                foreach (SentimentClass label_1 in SentimentClass.Values())
                {
                    Pair <double, double> pr = foldClassifier.EvaluatePrecisionAndRecall(trainTest.second, label_1);
                    sumP.IncrementCount(label_1, pr.first);
                    sumR.IncrementCount(label_1, pr.second);
                }
            }
            DecimalFormat df = new DecimalFormat("0.000%");

            log.Info("----------");
            double aveAccuracy = sumAccuracy / ((double)numFolds);

            log.Info(string.Empty + numFolds + "-fold accuracy: " + df.Format(aveAccuracy));
            log.Info(string.Empty);
            foreach (SentimentClass label_2 in SentimentClass.Values())
            {
                double p = sumP.GetCount(label_2) / numFolds;
                double r = sumR.GetCount(label_2) / numFolds;
                log.Info(label_2 + " (P)  = " + df.Format(p));
                log.Info(label_2 + " (R)  = " + df.Format(r));
                log.Info(label_2 + " (F1) = " + df.Format(2 * p * r / (p + r)));
                log.Info(string.Empty);
            }
            log.Info("----------");
            Redwood.Util.EndTrack("Evaluating");
            // Return
            return(new SimpleSentiment(classifier));
        }