/// <summary>Train a multinomial classifier off of the provided dataset.</summary>
/// <param name="dataset">The dataset to train the classifier off of.</param>
/// <param name="featureThreshold">Minimum occurrence count a feature needs in order to be kept.</param>
/// <param name="sigma">Regularization strength handed to the classifier factory.</param>
/// <returns>A classifier.</returns>
public static IClassifier <string, string> TrainMultinomialClassifier(GeneralDataset <string, string> dataset, int featureThreshold, double sigma)
{
    // Set up the dataset and factory
    log.Info("Applying feature threshold (" + featureThreshold + ")...");
    dataset.ApplyFeatureCountThreshold(featureThreshold);
    log.Info("Randomizing dataset...");
    // Fixed seed so the shuffle is reproducible across runs.
    // (Was `42l`: the lowercase 'l' long suffix is easily misread as the digit 1.)
    dataset.Randomize(42L);
    log.Info("Creating factory...");
    LinearClassifierFactory <string, string> factory = InitFactory(sigma);
    // Train the final classifier
    log.Info("BEGIN training");
    LinearClassifier <string, string> classifier = factory.TrainClassifier(dataset);
    log.Info("END training");
    // Debug: report accuracy on the training set itself (not held-out data).
    KBPRelationExtractor.Accuracy trainAccuracy = new KBPRelationExtractor.Accuracy();
    foreach (IDatum <string, string> datum in dataset)
    {
        string guess = classifier.ClassOf(datum);
        trainAccuracy.Predict(Java.Util.Collections.Singleton(guess), Java.Util.Collections.Singleton(datum.Label()));
    }
    log.Info("Training accuracy:");
    log.Info(trainAccuracy.ToString());
    log.Info(string.Empty);
    // Return the classifier
    return classifier;
}
/// <summary>Train on a list of ExtractionSentence containing labeled RelationMention objects</summary>
public virtual void Train(Annotation sentences)
{
    // Build the training examples from the annotated sentences, then fit a
    // single multi-class classifier over them.
    GeneralDataset <string, string> dataset = CreateDataset(sentences);
    TrainMulticlass(dataset);
}
/// <summary>
/// Command-line entry point: trains a singleton predictor from the configured
/// CoNLL-2011 corpus and serializes the resulting model to disk.
/// </summary>
/// <exception cref="System.Exception"/>
public static void Main(string[] args)
{
    Properties props = args.Length > 0 ? StringUtils.ArgsToProperties(args) : new Properties();
    // Both properties are mandatory; log a usage hint and bail if either is missing.
    if (!props.Contains("dcoref.conll2011"))
    {
        log.Info("-dcoref.conll2011 [input_CoNLL_corpus]: was not specified");
        return;
    }
    if (!props.Contains("singleton.predictor.output"))
    {
        log.Info("-singleton.predictor.output [output_model_file]: was not specified");
        return;
    }
    GeneralDataset <string, string> featureVectors = Edu.Stanford.Nlp.Coref.Misc.SingletonPredictor.GenerateFeatureVectors(props);
    LogisticClassifier <string, string> predictor = Edu.Stanford.Nlp.Coref.Misc.SingletonPredictor.Train(featureVectors);
    Edu.Stanford.Nlp.Coref.Misc.SingletonPredictor.SaveToSerialized(predictor, GetPathSingletonPredictor(props));
}
/// <summary>Train a single multi-class classifier over the given dataset.</summary>
/// <param name="trainSet">The dataset to fit the classifier on.</param>
public virtual void TrainMulticlass(GeneralDataset <string, string> trainSet)
{
    if (Sharpen.Runtime.EqualsIgnoreCase(relationExtractorClassifierType, "linear"))
    {
        LinearClassifierFactory <string, string> factory = new LinearClassifierFactory <string, string>(1e-4, false, sigma);
        factory.SetVerbose(false);
        // use in-place SGD instead of QN. this is faster but much worse!
        // factory.useInPlaceStochasticGradientDescent(-1, -1, 1.0);
        // use a hybrid minimizer: start with in-place SGD, continue with QN
        // factory.useHybridMinimizerWithInPlaceSGD(50, -1, sigma);
        classifier = factory.TrainClassifier(trainSet);
    }
    else if (Sharpen.Runtime.EqualsIgnoreCase(relationExtractorClassifierType, "svm"))
    {
        SVMLightClassifierFactory <string, string> factory = new SVMLightClassifierFactory <string, string>();
        factory.SetC(sigma);
        classifier = factory.TrainClassifier(trainSet);
    }
    else
    {
        throw new Exception("Invalid classifier type: " + relationExtractorClassifierType);
    }
    if (logger.IsLoggable(Level.Fine))
    {
        ReportWeights(classifier, null);
    }
}
/// <summary>Train the singleton predictor using a logistic regression classifier.</summary>
/// <param name="pDataset">Dataset of features</param>
/// <returns>Singleton predictor</returns>
public static LogisticClassifier <string, string> Train(GeneralDataset <string, string> pDataset)
{
    // Fit a logistic regression model directly; no thresholding or tuning here.
    return new LogisticClassifierFactory <string, string>().TrainClassifier(pDataset);
}
/// <summary>
/// Scores the classifier on the dataset, accumulating per-class TP/FP/FN counts
/// (predictions and gold labels equal to the negative label are not tallied),
/// and returns the resulting F-measure.
/// </summary>
/// <param name="classifier">Classifier to evaluate.</param>
/// <param name="data">Dataset providing the gold labels.</param>
/// <returns>The F-measure computed from the accumulated counts (via GetFMeasure()).</returns>
public virtual double Score <F>(IClassifier <L, F> classifier, GeneralDataset <L, F> data)
{
    IList <L> guesses = new List <L>();
    IList <L> labels = new List <L>();
    // First pass: collect the classifier's guess for every datum.
    for (int i = 0; i < data.Size(); i++)
    {
        IDatum <L, F> d = data.GetRVFDatum(i);
        L guess = classifier.ClassOf(d);
        guesses.Add(guess);
    }
    // Translate the dataset's integer label array back into label objects,
    // using the dataset's own index for the lookup.
    int[] labelsArr = data.GetLabelsArray();
    labelIndex = data.labelIndex;
    for (int i_1 = 0; i_1 < data.Size(); i_1++)
    {
        labels.Add(labelIndex.Get(labelsArr[i_1]));
    }
    // Rebuild the label index as the union of labels known to the dataset and
    // to the classifier, so both guess and gold labels resolve to a valid index.
    labelIndex = new HashIndex <L>();
    labelIndex.AddAll(data.LabelIndex().ObjectsList());
    labelIndex.AddAll(classifier.Labels());
    int numClasses = labelIndex.Size();
    tpCount = new int[numClasses];
    fpCount = new int[numClasses];
    fnCount = new int[numClasses];
    negIndex = labelIndex.IndexOf(negLabel);
    for (int i_2 = 0; i_2 < guesses.Count; ++i_2)
    {
        L guess = guesses[i_2];
        int guessIndex = labelIndex.IndexOf(guess);
        L label = labels[i_2];
        int trueIndex = labelIndex.IndexOf(label);
        if (guessIndex == trueIndex)
        {
            // Correct non-negative prediction counts as a true positive;
            // correct negative predictions are not counted at all.
            if (guessIndex != negIndex)
            {
                tpCount[guessIndex]++;
            }
        }
        else
        {
            // Wrong guess: a false positive for the guessed class and a false
            // negative for the true class (each skipped if it is the negative label).
            if (guessIndex != negIndex)
            {
                fpCount[guessIndex]++;
            }
            if (trueIndex != negIndex)
            {
                fnCount[trueIndex]++;
            }
        }
    }
    return(GetFMeasure());
}
/// <summary>
/// Scores the classifier against the dataset: records every (guess, gold) pair
/// into the counters and returns the resulting F-measure.
/// </summary>
/// <param name="classifier">Classifier to evaluate.</param>
/// <param name="data">Dataset providing the gold labels.</param>
/// <returns>F-measure over the accumulated counts.</returns>
public override double Score <F>(IClassifier <L, F> classifier, GeneralDataset <L, F> data)
{
    // The label index must cover every label either side might produce.
    labelIndex = new HashIndex <L>();
    labelIndex.AddAll(classifier.Labels());
    labelIndex.AddAll(data.labelIndex.ObjectsList());
    ClearCounts();
    int[] goldLabels = data.GetLabelsArray();
    for (int idx = 0; idx < data.Size(); idx++)
    {
        IDatum <L, F> datum = data.GetRVFDatum(idx);
        // Record the prediction against the gold label for this datum.
        AddGuess(classifier.ClassOf(datum), labelIndex.Get(goldLabels[idx]));
    }
    FinalizeCounts();
    return GetFMeasure();
}
/// <summary>
/// Initializes the multi-class statistics: overall accuracy, log-likelihood of
/// the gold labels, and per-datum guess scores with correctness flags, ordered
/// by descending guess confidence.
/// </summary>
/// <param name="classifier">Probabilistic classifier to evaluate.</param>
/// <param name="data">Dataset providing the gold labels.</param>
public virtual void InitMC <F>(IProbabilisticClassifier <L, F> classifier, GeneralDataset <L, F> data)
{
    //if (!(gData instanceof Dataset)) {
    //  throw new UnsupportedOperationException("Can only handle Datasets, not "+gData.getClass().getName());
    //}
    //
    //Dataset data = (Dataset)gData;
    // Priority queue orders datums by descending guess confidence
    // (priority is the negated guess score).
    IPriorityQueue <Pair <int, Pair <double, bool> > > q = new BinaryHeapPriorityQueue <Pair <int, Pair <double, bool> > >();
    total = 0;
    correct = 0;
    logLikelihood = 0.0;
    for (int i = 0; i < data.Size(); i++)
    {
        IDatum <L, F> d = data.GetRVFDatum(i);
        ICounter <L> scores = classifier.LogProbabilityOf(d);
        L guess = Counters.Argmax(scores);
        L correctLab = d.Label();
        double guessScore = scores.GetCount(guess);
        double correctScore = scores.GetCount(correctLab);
        int guessInd = data.LabelIndex().IndexOf(guess);
        int correctInd = data.LabelIndex().IndexOf(correctLab);
        total++;
        if (guessInd == correctInd)
        {
            correct++;
        }
        // Accumulate the log-probability assigned to the gold label.
        logLikelihood += correctScore;
        // NOTE(review): `int.Parse(i)` and `bool.ValueOf(...)` look like leftover
        // Integer.valueOf/Boolean.valueOf calls from the Java-to-C# conversion —
        // verify they resolve against the Sharpen compatibility shims.
        q.Add(new Pair <int, Pair <double, bool> >(int.Parse(i), new Pair <double, bool>(guessScore, bool.ValueOf(guessInd == correctInd))), -guessScore);
    }
    accuracy = (double)correct / (double)total;
    // Unpack the confidence-sorted queue into parallel score/correctness arrays.
    IList <Pair <int, Pair <double, bool> > > sorted = q.ToSortedList();
    scores = new double[sorted.Count];
    isCorrect = new bool[sorted.Count];
    for (int i_1 = 0; i_1 < sorted.Count; i_1++)
    {
        Pair <double, bool> next = sorted[i_1].Second();
        scores[i_1] = next.First();
        isCorrect[i_1] = next.Second();
    }
}
/// <summary>
/// Scores the classifier via a precision-recall curve built from the probability
/// it assigns to the positive label on each datum, populating the accuracy and
/// confidence-weighted accuracy statistics.
/// </summary>
/// <param name="classifier">Probabilistic classifier to evaluate.</param>
/// <param name="data">Dataset providing the gold labels.</param>
/// <returns>Plain accuracy as computed by the PR curve.</returns>
public virtual double Score <F>(IProbabilisticClassifier <L, F> classifier, GeneralDataset <L, F> data)
{
    // Pair each datum's positive-class probability with a 0/1 gold indicator.
    List <Pair <double, int> > dataScores = new List <Pair <double, int> >();
    for (int idx = 0; idx < data.Size(); idx++)
    {
        IDatum <L, F> datum = data.GetRVFDatum(idx);
        ICounter <L> logProbs = classifier.LogProbabilityOf(datum);
        int goldIsPositive = datum.Label().Equals(posLabel) ? 1 : 0;
        dataScores.Add(new Pair <double, int>(Math.Exp(logProbs.GetCount(posLabel)), goldIsPositive));
    }
    // Derive every summary statistic from the PR curve in one pass.
    PRCurve prc = new PRCurve(dataScores);
    confWeightedAccuracy = prc.Cwa();
    accuracy = prc.Accuracy();
    optAccuracy = prc.OptimalAccuracy();
    optConfWeightedAccuracy = prc.OptimalCwa();
    logLikelihood = prc.LogLikelihood();
    accrecall = prc.CwaArray();
    optaccrecall = prc.OptimalCwaArray();
    return accuracy;
}
/// <summary>
/// Builds chunk-level evaluation stats; label-level bookkeeping is delegated
/// to the base class.
/// </summary>
public MultiClassChunkEvalStats(IClassifier <string, F> classifier, GeneralDataset <string, F> data, string negLabel)
    : base(classifier, data, negLabel)
{
    // The chunk identifier must agree with these stats on which label is "negative".
    LabeledChunkIdentifier identifier = new LabeledChunkIdentifier();
    identifier.SetNegLabel(negLabel);
    chunker = identifier;
}
/// <summary>Creates the objective with a default quadratic (L2) log-prior.</summary>
/// <param name="dataset">Training data.</param>
/// <param name="confusionMatrix">Bias/confusion matrix applied by the objective.</param>
public BiasedLogConditionalObjectiveFunction(GeneralDataset dataset, double[][] confusionMatrix)
    : this(dataset, confusionMatrix, new LogPrior(LogPrior.LogPriorType.QUADRATIC))
{
}
/// <summary>
/// Builds string-label stats; scoring is delegated to the base class, with a
/// string-to-string converter for label handling.
/// </summary>
public MultiClassStringLabelStats(IClassifier <string, F> classifier, GeneralDataset <string, F> data, string negLabel)
    : base(classifier, data, negLabel)
{
    stringConverter = new MultiClassPrecisionRecallExtendedStats.StringStringConverter();
}
/// <summary>Unpacks the dataset into raw arrays and delegates to the main constructor.</summary>
/// <param name="dataset">Training data (supplies feature/class counts and data/label arrays).</param>
/// <param name="confusionMatrix">Bias/confusion matrix applied by the objective.</param>
/// <param name="prior">Log-prior used for regularization.</param>
public BiasedLogConditionalObjectiveFunction(GeneralDataset dataset, double[][] confusionMatrix, LogPrior prior)
    : this(dataset.NumFeatures(), dataset.NumClasses(), dataset.GetDataArray(), dataset.GetLabelsArray(), confusionMatrix, prior)
{
}
/// <summary>
/// Builds accuracy stats for the classifier on the data, remembering the file
/// the scores should be saved to and the score variant to compute.
/// </summary>
/// <param name="classifier">Probabilistic classifier to evaluate.</param>
/// <param name="data">Dataset to evaluate on.</param>
/// <param name="file">Path the stats are saved to.</param>
/// <param name="scoreType">Which score variant to compute.</param>
public MultiClassAccuracyStats(IProbabilisticClassifier <L, F> classifier, GeneralDataset <L, F> data, string file, int scoreType)
{
    this.scoreType = scoreType;
    saveFile = file;
    // Populate the multi-class statistics immediately on construction.
    InitMC(classifier, data);
}
/// <summary>A helper function for dumping the accuracy of the trained classifier.</summary>
/// <param name="classifier">The classifier to evaluate.</param>
/// <param name="dataset">The dataset to evaluate the classifier on.</param>
public static void DumpAccuracy(IClassifier <ClauseSplitter.ClauseClassifierLabel, string> classifier, GeneralDataset <ClauseSplitter.ClauseClassifierLabel, string> dataset)
{
    DecimalFormat df = new DecimalFormat("0.00%");
    Redwood.Log("size: " + dataset.Size());
    // Count gold labels with a plain loop. The original streamed the dataset with
    // `.Filter(null)` (twice) — a null predicate that fails at runtime; the lambda
    // bodies were evidently lost in the Java-to-C# conversion. The label being
    // counted for each message follows the log text ("split" / "interm").
    int splitCount = 0;
    int intermCount = 0;
    foreach (IDatum <ClauseSplitter.ClauseClassifierLabel, string> datum in dataset)
    {
        if (ClauseSplitter.ClauseClassifierLabel.ClauseSplit.Equals(datum.Label()))
        {
            splitCount++;
        }
        else if (ClauseSplitter.ClauseClassifierLabel.ClauseInterm.Equals(datum.Label()))
        {
            intermCount++;
        }
    }
    Redwood.Log("split count: " + splitCount);
    Redwood.Log("interm count: " + intermCount);
    Pair <double, double> pr = classifier.EvaluatePrecisionAndRecall(dataset, ClauseSplitter.ClauseClassifierLabel.ClauseSplit);
    Redwood.Log("p (split): " + df.Format(pr.first));
    Redwood.Log("r (split): " + df.Format(pr.second));
    Redwood.Log("f1 (split): " + df.Format(2 * pr.first * pr.second / (pr.first + pr.second)));
    pr = classifier.EvaluatePrecisionAndRecall(dataset, ClauseSplitter.ClauseClassifierLabel.ClauseInterm);
    Redwood.Log("p (interm): " + df.Format(pr.first));
    Redwood.Log("r (interm): " + df.Format(pr.second));
    Redwood.Log("f1 (interm): " + df.Format(2 * pr.first * pr.second / (pr.first + pr.second)));
}
/// <summary>
/// Scores a probabilistic classifier by delegating to the plain-classifier
/// Score overload (the probabilistic interface extends the plain one here).
/// </summary>
public virtual double Score <F>(IProbabilisticClassifier <L, F> classifier, GeneralDataset <L, F> data)
{
    return(Score((IClassifier <L, F>)classifier, data));
}
/// <summary>Builds precision/recall stats, immediately scoring the classifier on the data.</summary>
/// <param name="classifier">Classifier to evaluate.</param>
/// <param name="data">Dataset providing the gold labels.</param>
/// <param name="negLabel">Label treated as "negative" and excluded from TP/FP/FN counts.</param>
public MultiClassPrecisionRecallStats(IClassifier <L, F> classifier, GeneralDataset <L, F> data, L negLabel)
{
    // negLabel must be assigned before Score(), which reads it to locate negIndex.
    this.negLabel = negLabel;
    Score(classifier, data);
}
/// <summary>Bundles the quote-attribution feature data into a single value object.</summary>
/// <param name="mapQuoteToDataRange">Mapping from quote index to its datum index range.</param>
/// <param name="mapDatumToMention">Mapping from datum index to its mention data.</param>
/// <param name="dataset">The feature dataset itself.</param>
public FeaturesData(IDictionary <int, Pair <int, int> > mapQuoteToDataRange, IDictionary <int, Sieve.MentionData> mapDatumToMention, GeneralDataset <string, string> dataset)
{
    // Plain field assignments; no validation or copying is performed.
    this.dataset = dataset;
    this.mapDatumToMention = mapDatumToMention;
    this.mapQuoteToDataRange = mapQuoteToDataRange;
}
/// <summary>Builds accuracy stats, immediately scoring the classifier on the data.</summary>
/// <param name="classifier">Probabilistic classifier to evaluate.</param>
/// <param name="data">Dataset providing the gold labels.</param>
/// <param name="posLabel">Label treated as "positive" when building the PR curve.</param>
public AccuracyStats(IProbabilisticClassifier <L, F> classifier, GeneralDataset <L, F> data, L posLabel)
{
    // posLabel must be assigned before Score(), which compares gold labels against it.
    this.posLabel = posLabel;
    Score(classifier, data);
}
/// <summary>Convenience constructor using the default accuracy score type (UseAccuracy).</summary>
public MultiClassAccuracyStats(IProbabilisticClassifier <L, F> classifier, GeneralDataset <L, F> data, string file)
    : this(classifier, data, file, UseAccuracy)
{
}
/// <summary>Builds extended precision/recall stats; all work happens in the base constructor.</summary>
public MultiClassPrecisionRecallExtendedStats(IClassifier <L, F> classifier, GeneralDataset <L, F> data, L negLabel)
    : base(classifier, data, negLabel)
{
}
/// <summary>
/// Scores the classifier by first initializing the multi-class statistics over
/// the data, then computing the configured score from them.
/// </summary>
public virtual double Score <F>(IProbabilisticClassifier <L, F> classifier, GeneralDataset <L, F> data)
{
    InitMC(classifier, data);
    return(Score());
}
/// <summary>Trains the quote-to-mention classifier from the given training set.</summary>
/// <param name="trainingSet">Labeled training data to fit on.</param>
public ExtractQuotesClassifier(GeneralDataset <string, string> trainingSet)
{
    // Default linear classifier factory settings; no tuning applied here.
    quoteToMentionClassifier = new LinearClassifierFactory <string, string>().TrainClassifier(trainingSet);
}