Example #1
        /// <summary>Trains a multinomial classifier on the provided dataset.</summary>
        /// <param name="dataset">The dataset to train the classifier on.</param>
        /// <param name="featureThreshold">Minimum feature count; features seen fewer times are pruned.</param>
        /// <param name="sigma">Sigma of the Gaussian prior used by the classifier factory.</param>
        /// <returns>The trained classifier.</returns>
        public static IClassifier <string, string> TrainMultinomialClassifier(GeneralDataset <string, string> dataset, int featureThreshold, double sigma)
        {
            // Set up the dataset and factory
            log.Info("Applying feature threshold (" + featureThreshold + ")...");
            dataset.ApplyFeatureCountThreshold(featureThreshold);
            log.Info("Randomizing dataset...");
            dataset.Randomize(42L);
            log.Info("Creating factory...");
            LinearClassifierFactory <string, string> factory = InitFactory(sigma);

            // Train the final classifier
            log.Info("BEGIN training");
            LinearClassifier <string, string> classifier = factory.TrainClassifier(dataset);

            log.Info("END training");
            // Debug
            KBPRelationExtractor.Accuracy trainAccuracy = new KBPRelationExtractor.Accuracy();
            foreach (IDatum <string, string> datum in dataset)
            {
                string guess = classifier.ClassOf(datum);
                trainAccuracy.Predict(Java.Util.Collections.Singleton(guess), Java.Util.Collections.Singleton(datum.Label()));
            }
            log.Info("Training accuracy:");
            log.Info(trainAccuracy.ToString());
            log.Info(string.Empty);
            // Return the classifier
            return(classifier);
        }
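For context, a minimal usage sketch for the method above; the RVFDataset construction and the threshold/sigma values are illustrative assumptions, not taken from the original source:

        // Hypothetical caller: prune features seen fewer than 5 times and
        // train with a Gaussian prior sigma of 1.0 (values are illustrative).
        GeneralDataset <string, string> dataset = new RVFDataset <string, string>();
        // ... populate dataset with labeled IDatum <string, string> instances ...
        IClassifier <string, string> classifier = TrainMultinomialClassifier(dataset, 5, 1.0);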
Example #2
        /// <summary>Trains on a list of ExtractionSentences containing labeled RelationMention objects.</summary>
        public virtual void Train(Annotation sentences)
        {
            // Train a single multi-class classifier
            GeneralDataset <string, string> trainSet = CreateDataset(sentences);

            TrainMulticlass(trainSet);
        }
        /// <exception cref="System.Exception"/>
        public static void Main(string[] args)
        {
            Properties props;

            if (args.Length > 0)
            {
                props = StringUtils.ArgsToProperties(args);
            }
            else
            {
                props = new Properties();
            }
            if (!props.Contains("dcoref.conll2011"))
            {
                log.Info("-dcoref.conll2011 [input_CoNLL_corpus]: was not specified");
                return;
            }
            if (!props.Contains("singleton.predictor.output"))
            {
                log.Info("-singleton.predictor.output [output_model_file]: was not specified");
                return;
            }
            GeneralDataset <string, string>     data       = Edu.Stanford.Nlp.Coref.Misc.SingletonPredictor.GenerateFeatureVectors(props);
            LogisticClassifier <string, string> classifier = Edu.Stanford.Nlp.Coref.Misc.SingletonPredictor.Train(data);

            Edu.Stanford.Nlp.Coref.Misc.SingletonPredictor.SaveToSerialized(classifier, GetPathSingletonPredictor(props));
        }
Example #4
 public virtual void TrainMulticlass(GeneralDataset <string, string> trainSet)
 {
     if (Sharpen.Runtime.EqualsIgnoreCase(relationExtractorClassifierType, "linear"))
     {
         LinearClassifierFactory <string, string> lcFactory = new LinearClassifierFactory <string, string>(1e-4, false, sigma);
         lcFactory.SetVerbose(false);
         // use in-place SGD instead of QN. this is faster but much worse!
         // lcFactory.useInPlaceStochasticGradientDescent(-1, -1, 1.0);
         // use a hybrid minimizer: start with in-place SGD, continue with QN
         // lcFactory.useHybridMinimizerWithInPlaceSGD(50, -1, sigma);
         classifier = lcFactory.TrainClassifier(trainSet);
     }
     else
     {
         if (Sharpen.Runtime.EqualsIgnoreCase(relationExtractorClassifierType, "svm"))
         {
             SVMLightClassifierFactory <string, string> svmFactory = new SVMLightClassifierFactory <string, string>();
             svmFactory.SetC(sigma);
             classifier = svmFactory.TrainClassifier(trainSet);
         }
         else
         {
             throw new Exception("Invalid classifier type: " + relationExtractorClassifierType);
         }
     }
     if (logger.IsLoggable(Level.Fine))
     {
         ReportWeights(classifier, null);
     }
 }
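Note that sigma does double duty above: it is the Gaussian prior's sigma for the "linear" branch and is passed to SetC as SVMlight's C for the "svm" branch. A minimal, hypothetical driver (field values are illustrative):

     // Illustrative only: select the linear branch and train; switching
     // relationExtractorClassifierType to "svm" would reuse sigma as C.
     relationExtractorClassifierType = "linear";
     sigma = 1.0;
     TrainMulticlass(trainSet);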
        /// <summary>Train the singleton predictor using a logistic regression classifier.</summary>
        /// <param name="pDataset">Dataset of features</param>
        /// <returns>Singleton predictor</returns>
        public static LogisticClassifier <string, string> Train(GeneralDataset <string, string> pDataset)
        {
            LogisticClassifierFactory <string, string> lcf        = new LogisticClassifierFactory <string, string>();
            LogisticClassifier <string, string>        classifier = lcf.TrainClassifier(pDataset);

            return(classifier);
        }
        public virtual double Score <F>(IClassifier <L, F> classifier, GeneralDataset <L, F> data)
        {
            IList <L> guesses = new List <L>();
            IList <L> labels  = new List <L>();

            for (int i = 0; i < data.Size(); i++)
            {
                IDatum <L, F> d     = data.GetRVFDatum(i);
                L             guess = classifier.ClassOf(d);
                guesses.Add(guess);
            }
            int[] labelsArr = data.GetLabelsArray();
            labelIndex = data.labelIndex;
            for (int i_1 = 0; i_1 < data.Size(); i_1++)
            {
                labels.Add(labelIndex.Get(labelsArr[i_1]));
            }
            labelIndex = new HashIndex <L>();
            labelIndex.AddAll(data.LabelIndex().ObjectsList());
            labelIndex.AddAll(classifier.Labels());
            int numClasses = labelIndex.Size();

            tpCount  = new int[numClasses];
            fpCount  = new int[numClasses];
            fnCount  = new int[numClasses];
            negIndex = labelIndex.IndexOf(negLabel);
            for (int i_2 = 0; i_2 < guesses.Count; ++i_2)
            {
                L   guess      = guesses[i_2];
                int guessIndex = labelIndex.IndexOf(guess);
                L   label      = labels[i_2];
                int trueIndex  = labelIndex.IndexOf(label);
                if (guessIndex == trueIndex)
                {
                    if (guessIndex != negIndex)
                    {
                        tpCount[guessIndex]++;
                    }
                }
                else
                {
                    if (guessIndex != negIndex)
                    {
                        fpCount[guessIndex]++;
                    }
                    if (trueIndex != negIndex)
                    {
                        fnCount[trueIndex]++;
                    }
                }
            }
            return(GetFMeasure());
        }
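GetFMeasure is not part of this listing; a plausible micro-averaged F1 over the tp/fp/fn arrays accumulated above might look like the following sketch (an assumption about its shape, not the actual implementation):

        public double GetFMeasure()
        {
            // Micro-averaged F1: pool counts over all non-negative classes.
            int tp = 0, fp = 0, fn = 0;
            for (int c = 0; c < tpCount.Length; c++)
            {
                tp += tpCount[c];
                fp += fpCount[c];
                fn += fnCount[c];
            }
            double precision = (tp + fp == 0) ? 1.0 : (double)tp / (tp + fp);
            double recall    = (tp + fn == 0) ? 1.0 : (double)tp / (tp + fn);
            return (precision + recall == 0.0) ? 0.0 : 2.0 * precision * recall / (precision + recall);
        }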
Example #7
 public override double Score <F>(IClassifier <L, F> classifier, GeneralDataset <L, F> data)
 {
     labelIndex = new HashIndex <L>();
     labelIndex.AddAll(classifier.Labels());
     labelIndex.AddAll(data.labelIndex.ObjectsList());
     ClearCounts();
     int[] labelsArr = data.GetLabelsArray();
     for (int i = 0; i < data.Size(); i++)
     {
         IDatum <L, F> d     = data.GetRVFDatum(i);
         L             guess = classifier.ClassOf(d);
         AddGuess(guess, labelIndex.Get(labelsArr[i]));
     }
     FinalizeCounts();
     return(GetFMeasure());
 }
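ClearCounts, AddGuess, and FinalizeCounts are inherited helpers not shown in this listing. As a rough sketch, a per-label AddGuess consistent with the counting logic of the Score method earlier might look like this (hypothetical; the chunk-level subclass compares spans rather than single labels):

 // Hypothetical helper mirroring the earlier tp/fp/fn bookkeeping.
 protected void AddGuess(L guess, L label)
 {
     int guessIndex = labelIndex.IndexOf(guess);
     int trueIndex  = labelIndex.IndexOf(label);
     if (guessIndex == trueIndex)
     {
         if (guessIndex != negIndex) { tpCount[guessIndex]++; }
     }
     else
     {
         if (guessIndex != negIndex) { fpCount[guessIndex]++; }
         if (trueIndex != negIndex)  { fnCount[trueIndex]++;  }
     }
 }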
        public virtual void InitMC <F>(IProbabilisticClassifier <L, F> classifier, GeneralDataset <L, F> data)
        {
            //if (!(gData instanceof Dataset)) {
            //  throw new UnsupportedOperationException("Can only handle Datasets, not "+gData.getClass().getName());
            //}
            //
            //Dataset data = (Dataset)gData;
            IPriorityQueue <Pair <int, Pair <double, bool> > > q = new BinaryHeapPriorityQueue <Pair <int, Pair <double, bool> > >();

            total         = 0;
            correct       = 0;
            logLikelihood = 0.0;
            for (int i = 0; i < data.Size(); i++)
            {
                IDatum <L, F> d            = data.GetRVFDatum(i);
                ICounter <L>  scores       = classifier.LogProbabilityOf(d);
                L             guess        = Counters.Argmax(scores);
                L             correctLab   = d.Label();
                double        guessScore   = scores.GetCount(guess);
                double        correctScore = scores.GetCount(correctLab);
                int           guessInd     = data.LabelIndex().IndexOf(guess);
                int           correctInd   = data.LabelIndex().IndexOf(correctLab);
                total++;
                if (guessInd == correctInd)
                {
                    correct++;
                }
                logLikelihood += correctScore;
                q.Add(new Pair <int, Pair <double, bool> >(i, new Pair <double, bool>(guessScore, guessInd == correctInd)), -guessScore);
            }
            accuracy = (double)correct / (double)total;
            IList <Pair <int, Pair <double, bool> > > sorted = q.ToSortedList();

            scores    = new double[sorted.Count];
            isCorrect = new bool[sorted.Count];
            for (int i_1 = 0; i_1 < sorted.Count; i_1++)
            {
                Pair <double, bool> next = sorted[i_1].Second();
                scores[i_1]    = next.First();
                isCorrect[i_1] = next.Second();
            }
        }
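The scores and isCorrect arrays come out ordered by the queue's -guessScore priority, so accuracy/coverage curves fall out directly; a hypothetical consumer (CumulativeAccuracy is not part of the original class):

        // Hypothetical consumer: cumulative accuracy as examples are taken in
        // the confidence order established by the -guessScore priority above.
        public double[] CumulativeAccuracy()
        {
            double[] curve = new double[isCorrect.Length];
            int correctSoFar = 0;
            for (int i = 0; i < isCorrect.Length; i++)
            {
                if (isCorrect[i]) { correctSoFar++; }
                curve[i] = (double)correctSoFar / (i + 1);
            }
            return curve;
        }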
Example #9
        public virtual double Score <F>(IProbabilisticClassifier <L, F> classifier, GeneralDataset <L, F> data)
        {
            List <Pair <double, int> > dataScores = new List <Pair <double, int> >();

            for (int i = 0; i < data.Size(); i++)
            {
                IDatum <L, F> d      = data.GetRVFDatum(i);
                ICounter <L>  scores = classifier.LogProbabilityOf(d);
                int           labelD = d.Label().Equals(posLabel) ? 1 : 0;
                dataScores.Add(new Pair <double, int>(Math.Exp(scores.GetCount(posLabel)), labelD));
            }
            PRCurve prc = new PRCurve(dataScores);

            confWeightedAccuracy    = prc.Cwa();
            accuracy                = prc.Accuracy();
            optAccuracy             = prc.OptimalAccuracy();
            optConfWeightedAccuracy = prc.OptimalCwa();
            logLikelihood           = prc.LogLikelihood();
            accrecall               = prc.CwaArray();
            optaccrecall            = prc.OptimalCwaArray();
            return(accuracy);
        }
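PRCurve consumes (probability of posLabel, 0/1 gold label) pairs, which makes it easy to sanity-check by hand; a toy sketch (values are made up for illustration):

            // Toy input: three gold positives and one confident false positive.
            List <Pair <double, int> > toy = new List <Pair <double, int> >();
            toy.Add(new Pair <double, int>(0.95, 1));
            toy.Add(new Pair <double, int>(0.90, 1));
            toy.Add(new Pair <double, int>(0.40, 1));
            toy.Add(new Pair <double, int>(0.80, 0));
            PRCurve toyCurve = new PRCurve(toy);
            double toyAccuracy = toyCurve.Accuracy();  // accuracy as scored by PRCurve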
Example #10
 public MultiClassChunkEvalStats(IClassifier <string, F> classifier, GeneralDataset <string, F> data, string negLabel)
     : base(classifier, data, negLabel)
 {
     chunker = new LabeledChunkIdentifier();
     chunker.SetNegLabel(negLabel);
 }
 public BiasedLogConditionalObjectiveFunction(GeneralDataset dataset, double[][] confusionMatrix)
     : this(dataset, confusionMatrix, new LogPrior(LogPrior.LogPriorType.QUADRATIC))
 {
 }
Example #12
 public MultiClassStringLabelStats(IClassifier <string, F> classifier, GeneralDataset <string, F> data, string negLabel)
     : base(classifier, data, negLabel)
 {
     stringConverter = new MultiClassPrecisionRecallExtendedStats.StringStringConverter();
 }
 public BiasedLogConditionalObjectiveFunction(GeneralDataset dataset, double[][] confusionMatrix, LogPrior prior)
     : this(dataset.NumFeatures(), dataset.NumClasses(), dataset.GetDataArray(), dataset.GetLabelsArray(), confusionMatrix, prior)
 {
 }
 public MultiClassAccuracyStats(IProbabilisticClassifier <L, F> classifier, GeneralDataset <L, F> data, string file, int scoreType)
 {
     saveFile       = file;
     this.scoreType = scoreType;
     InitMC(classifier, data);
 }
Example #15
        /// <summary>A helper function for dumping the accuracy of the trained classifier.</summary>
        /// <param name="classifier">The classifier to evaluate.</param>
        /// <param name="dataset">The dataset to evaluate the classifier on.</param>
        public static void DumpAccuracy(IClassifier <ClauseSplitter.ClauseClassifierLabel, string> classifier, GeneralDataset <ClauseSplitter.ClauseClassifierLabel, string> dataset)
        {
            DecimalFormat df = new DecimalFormat("0.00%");

            Redwood.Log("size:         " + dataset.Size());
            Redwood.Log("split count:  " + StreamSupport.Stream(dataset.Spliterator(), false).Filter(null).Collect(Collectors.ToList()).Count);
            Redwood.Log("interm count: " + StreamSupport.Stream(dataset.Spliterator(), false).Filter(null).Collect(Collectors.ToList()).Count);
            Pair <double, double> pr = classifier.EvaluatePrecisionAndRecall(dataset, ClauseSplitter.ClauseClassifierLabel.ClauseSplit);

            Redwood.Log("p  (split):   " + df.Format(pr.first));
            Redwood.Log("r  (split):   " + df.Format(pr.second));
            Redwood.Log("f1 (split):   " + df.Format(2 * pr.first * pr.second / (pr.first + pr.second)));
            pr = classifier.EvaluatePrecisionAndRecall(dataset, ClauseSplitter.ClauseClassifierLabel.ClauseInterm);
            Redwood.Log("p  (interm):  " + df.Format(pr.first));
            Redwood.Log("r  (interm):  " + df.Format(pr.second));
            Redwood.Log("f1 (interm):  " + df.Format(2 * pr.first * pr.second / (pr.first + pr.second)));
        }
 public virtual double Score <F>(IProbabilisticClassifier <L, F> classifier, GeneralDataset <L, F> data)
 {
     return(Score((IClassifier <L, F>)classifier, data));
 }
 public MultiClassPrecisionRecallStats(IClassifier <L, F> classifier, GeneralDataset <L, F> data, L negLabel)
 {
     this.negLabel = negLabel;
     Score(classifier, data);
 }
 public FeaturesData(IDictionary <int, Pair <int, int> > mapQuoteToDataRange, IDictionary <int, Sieve.MentionData> mapDatumToMention, GeneralDataset <string, string> dataset)
 {
     this.mapQuoteToDataRange = mapQuoteToDataRange;
     this.mapDatumToMention   = mapDatumToMention;
     this.dataset             = dataset;
 }
Example #19
 public AccuracyStats(IProbabilisticClassifier <L, F> classifier, GeneralDataset <L, F> data, L posLabel)
 {
     this.posLabel = posLabel;
     Score(classifier, data);
 }
 public MultiClassAccuracyStats(IProbabilisticClassifier <L, F> classifier, GeneralDataset <L, F> data, string file)
     : this(classifier, data, file, UseAccuracy)
 {
 }
Example #21
 public MultiClassPrecisionRecallExtendedStats(IClassifier <L, F> classifier, GeneralDataset <L, F> data, L negLabel)
     : base(classifier, data, negLabel)
 {
 }
 public virtual double Score <F>(IProbabilisticClassifier <L, F> classifier, GeneralDataset <L, F> data)
 {
     InitMC(classifier, data);
     return(Score());
 }
        public ExtractQuotesClassifier(GeneralDataset <string, string> trainingSet)
        {
            LinearClassifierFactory <string, string> lcf = new LinearClassifierFactory <string, string>();

            quoteToMentionClassifier = lcf.TrainClassifier(trainingSet);
        }
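Finally, a hedged usage sketch for the constructor above; the dataset is a stand-in, and the trained LinearClassifier ends up in the quoteToMentionClassifier field:

            // Illustrative only: train from a prepared quote-to-mention dataset.
            GeneralDataset <string, string> trainingSet = new RVFDataset <string, string>();
            // ... add one labeled IDatum <string, string> per quote/mention candidate ...
            ExtractQuotesClassifier quoteClassifier = new ExtractQuotesClassifier(trainingSet);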