/// <summary>
/// Trains a one-vs-rest model for <paramref name="label"/>: all examples carrying
/// either other label are collapsed onto <paramref name="otherLabel1"/> before training.
/// Cross-validation scores of correctly classified examples are collected so the
/// returned <see cref="Model"/> carries score distributions for later thresholding.
/// </summary>
/// <param name="dataset">Full labeled training set (all three sentiment labels).</param>
/// <param name="label">The target label this model separates from the rest.</param>
/// <param name="otherLabel1">First non-target label; also the surrogate "rest" label.</param>
/// <param name="otherLabel2">Second non-target label.</param>
/// <returns>A trained <see cref="Model"/> with per-class score arrays (sorted ascending).</returns>
private Model TrainModel(LabeledDataset <SentimentLabel, SparseVector <double> > dataset,
                         SentimentLabel label, SentimentLabel otherLabel1, SentimentLabel otherLabel2)
{
    IModel <SentimentLabel, SparseVector <double> > model = CreateModel();

    // Share the denominator: the original enumerated the dataset with the same
    // predicate twice. Note: if no example differs from 'label' this stays NaN
    // (double division), same as before — no behavior change.
    int otherCount = dataset.Count(le => le.Label != label);
    double otherLabelWeight1 = (double)dataset.Count(le => le.Label == otherLabel1) / otherCount;
    double otherLabelWeight2 = (double)dataset.Count(le => le.Label == otherLabel2) / otherCount;

    // Binarize the dataset: keep the target label, relabel everything else as otherLabel1.
    dataset = new LabeledDataset <SentimentLabel, SparseVector <double> >(dataset.Select(le =>
        new LabeledExample <SentimentLabel, SparseVector <double> >(le.Label == label ? label : otherLabel1, le.Example)));

    // Best-prediction scores of correctly classified examples, split by winning class.
    var scores       = new List <double>();
    var scoresOthers = new List <double>();
    var validation   = new CrossValidator <SentimentLabel, SparseVector <double> >
    {
        NumFolds = NumTrainFolds,
        Dataset  = dataset,

        OnAfterPrediction = (sender, foldN, m, ex, le, prediction) =>
        {
            if (le.Label == prediction.BestClassLabel)
            {
                if (prediction.BestClassLabel == label)
                {
                    scores.Add(prediction.BestScore);
                }
                else
                {
                    scoresOthers.Add(prediction.BestScore);
                }
            }
            return true; // continue validation
        }
    };

    validation.Models.Add(model);
    validation.Run();

    // Final model is trained on the full binarized dataset.
    model.Train(dataset);

    return new Model
    {
        InnerModel = model,
        // Macro-F1 over the summed cross-validation confusion matrix.
        Weight = validation.PerfData.GetSumPerfMatrix(validation.ExpName, validation.GetModelName(model)).GetMacroF1(),
        Label = label,
        OtherLabel1 = otherLabel1,
        OtherLabelWeight1 = otherLabelWeight1,
        OtherLabel2 = otherLabel2,
        OtherLabelWeight2 = otherLabelWeight2,
        // Sorted ascending for downstream consumers of the score distributions.
        Scores = scores.OrderBy(s => s).ToArray(),
        ScoresOthers = scoresOthers.OrderBy(s => s).ToArray()
    };
}
// Example #2
        public override void Run(object[] args)
        {
            // get labeled data
            BinarySvm classifierInst = BinarySvm.RunInstanceNull(args);
            var       labeledData    = (LabeledDataset <string, SparseVector <double> >)classifierInst.Result["labeled_data"];

            // convert dataset to binary vector
            var ds = (LabeledDataset <string, BinaryVector>)labeledData.ConvertDataset(typeof(BinaryVector), false);

            // cross validation ...with the convenience class
            var validation = new CrossValidator <string, BinaryVector>
            {
                NumFolds     = 10,   // default
                IsStratified = true, // default
                ExpName      = "",   // default

                Dataset      = ds,
                OnAfterTrain = (sender, foldN, model, trainSet) =>
                {
                    var m = (NaiveBayesClassifier <string>)model;
                    // do stuff after model is trained for a fold...
                },
                OnAfterPrediction = (sender, foldN, model, ex, le, prediction) =>
                {
                    Output.WriteLine("actual: {0} \tpredicted: {1}\t score: {2:0.0000}", le.Label, prediction.BestClassLabel, prediction.BestScore);
                    return(true);
                },
                OnAfterFold = (sender, foldN, trainSet, foldPredictions) =>
                {
                    PerfMatrix <string> foldMatrix = sender.PerfData.GetPerfMatrix(sender.ExpName, sender.GetModelName(0), foldN);
                    Output.WriteLine("Accuracy for {0}-fold: {1:0.00}", foldN, foldMatrix.GetAccuracy());
                }
            };

            validation.Models.Add(new NaiveBayesClassifier <string>());
            validation.Run();

            Output.WriteLine("Sum confusion matrix:");
            PerfMatrix <string> sumPerfMatrix = validation.PerfData.GetSumPerfMatrix("", validation.GetModelName(0));

            Output.WriteLine(sumPerfMatrix.ToString());
            Output.WriteLine("Average accuracy: {0:0.00}", sumPerfMatrix.GetAccuracy());
            foreach (string label in validation.PerfData.GetLabels("", validation.GetModelName(0)))
            {
                double stdDev;
                Output.WriteLine("Precision for '{0}': {1:0.00} std. dev: {2:0.00}", label,
                                 validation.PerfData.GetAvg("", validation.GetModelName(0), ClassPerfMetric.Precision, label, out stdDev), stdDev);
            }
        }
        private Model TrainModel(LabeledDataset <SentimentLabel, SparseVector <double> > dataset,
                                 SentimentLabel label1, SentimentLabel label2)
        {
            IModel <SentimentLabel, SparseVector <double> > model = CreateModel();
            var scores1 = new List <double>();
            var scores2 = new List <double>();

            var validation = new CrossValidator <SentimentLabel, SparseVector <double> >
            {
                NumFolds = NumTrainFolds,
                Dataset  = dataset,

                OnAfterPrediction = (sender, foldN, m, ex, le, prediction) =>
                {
                    if (le.Label == prediction.BestClassLabel)
                    {
                        if (prediction.BestClassLabel == label1)
                        {
                            scores1.Add(prediction.BestScore);
                        }
                        else if (prediction.BestClassLabel == label2)
                        {
                            scores2.Add(prediction.BestScore);
                        }
                    }
                    return(true);
                }
            };

            validation.Models.Add(model);
            validation.Run();

            // train model
            model.Train(dataset);
            return(new Model
            {
                InnerModel = model,
                Label1 = label1,
                Label2 = label2,
                Scores1 = scores1.OrderBy(s => s).ToArray(),
                Scores2 = scores2.OrderBy(s => s).ToArray(),
                Weight = validation.PerfData.GetSumPerfMatrix(validation.ExpName, validation.GetModelName(model)).GetMacroF1()
            });
        }
// Example #4
        /// <summary>
        /// Trains the inner pos/neg binary classifier on the non-neutral examples and derives
        /// signed score bounds (NegBound/PosBound) that carve out a neutral band.
        /// Score convention used throughout: positive predictions keep their score,
        /// negative predictions are stored negated, so all scores lie on one signed axis.
        /// </summary>
        /// <param name="dataset">Labeled sentiment examples; neutral examples are used only for bound/stat estimation.</param>
        public override void Train(ILabeledExampleCollection <SentimentLabel, SparseVector <double> > dataset)
        {
            Preconditions.CheckNotNull(dataset);
            // Centile ranges are only required when bounds are NOT computed analytically.
            // NOTE(review): for nullable centiles the lifted comparison is false when null,
            // so a null centile fails this check unless IsCalcBounds — confirm intended.
            Preconditions.CheckArgumentRange(IsCalcBounds || NegCentile >= 0 && NegCentile <= 1);
            Preconditions.CheckArgumentRange(IsCalcBounds || PosCentile >= 0 && PosCentile <= 1);

            var labeledDataset = (LabeledDataset <SentimentLabel, SparseVector <double> >)dataset;

            if (labeledDataset.Count == 0)
            {
                // NOTE(review): logs to Console while sibling examples use Output, and
                // training continues on the empty set — confirm this is intentional.
                Console.WriteLine("empty dataset");
            }

            TrainStats = null;

            var posScores      = new List <double>();
            var negScores      = new List <double>();
            var neutralScores  = new List <double>();
            // Binary training set excludes neutral examples; neutral ones are kept
            // separately (only when needed) to probe where their scores fall.
            var trainDataset   = new LabeledDataset <SentimentLabel, SparseVector <double> >(labeledDataset.Where(le => le.Label != SentimentLabel.Neutral));
            var neutralDataset = IsCalcStats || IsCalcBounds
                ? new LabeledDataset <SentimentLabel, SparseVector <double> >(dataset.Where(le => le.Label == SentimentLabel.Neutral))
                : null;

            var validation = new CrossValidator <SentimentLabel, SparseVector <double> >
            {
                NumFolds = NumTrainFolds,
                Dataset  = trainDataset,

                OnAfterPrediction = (sender, foldN, model, example, le, prediction) =>
                {
                    // Collect scores only for correct predictions; negatives are negated
                    // so pos/neg scores share one signed axis.
                    if (le.Label == prediction.BestClassLabel)
                    {
                        if (le.Label == SentimentLabel.Positive)
                        {
                            posScores.Add(prediction.BestScore);
                        }
                        else
                        {
                            negScores.Add(-prediction.BestScore);
                        }
                    }
                    return(true);
                },

                OnAfterFold = (sender, foldN, trainSet, testSet) =>
                {
                    if (IsCalcStats || IsCalcBounds)
                    {
                        // Score the held-out neutral examples with this fold's model,
                        // applying the same sign convention.
                        // NOTE(review): every neutral example is re-scored each fold,
                        // so each contributes NumTrainFolds scores — confirm intended.
                        neutralScores.AddRange(neutralDataset
                                               .Select(le => sender.Models[0].Predict(le.Example))
                                               .Select(p => p.BestClassLabel == SentimentLabel.Positive ? p.BestScore : -p.BestScore));
                    }
                }
            };

            validation.Models.Add(CreateModel());
            validation.Run();

            if (IsCalcBounds)
            {
                // Derive bounds from score distributions. Scores are flipped back to
                // positive magnitudes before the call and the negative bound is re-negated.
                // NOTE(review): FindMaxExclusiveProbability semantics not visible here —
                // presumably finds a threshold maximizing exclusive probability; bound
                // falls back to 0 when no threshold is found.
                double negMaxProb, negScore;
                NegBound = FindMaxExclusiveProbability(neutralScores.Where(s => s < 0).Select(s => - s),
                                                       negScores.Select(s => - s), out negMaxProb, out negScore) ? -negScore : 0;

                double posMaxProb, posScore;
                PosBound = FindMaxExclusiveProbability(neutralScores.Where(s => s > 0),
                                                       posScores, out posMaxProb, out posScore) ? posScore : 0;
            }
            else
            {
                // Centile fallback: take the score at the requested quantile of each
                // distribution (FirstOrDefault yields 0 on an empty list).
                if (NegCentile != null)
                {
                    NegBound = negScores.OrderByDescending(bs => bs).Skip((int)Math.Truncate(negScores.Count * NegCentile.Value)).FirstOrDefault();
                }
                if (PosCentile != null)
                {
                    PosBound = posScores.OrderBy(bs => bs).Skip((int)Math.Truncate(posScores.Count * PosCentile.Value)).FirstOrDefault();
                }
            }

            if (IsCalcStats)
            {
                TrainStats = CalcStats(negScores, neutralScores, posScores);
            }

            // Reuse the cross-validated model instance and retrain it on the full
            // non-neutral training set.
            mBinaryClassifier = validation.Models[0];
            mBinaryClassifier.Train(trainDataset);

            IsTrained = true;
        }