Exemple #1
0
 /// <summary>
 ///     Prints a short, human-readable summary of a crowd data set to the console.
 /// </summary>
 /// <param name="crowdData">
 ///     The crowd data whose tweet, gold-label, worker and label counts are reported.
 /// </param>
 /// <param name="dataName">
 ///     The heading printed on the first line, followed by a colon.
 /// </param>
 public static void WriteCrowdDataDetailsToConsole(CrowdData crowdData, string dataName)
 {
     // Console.WriteLine forwards to Console.Out, so writing through the writer is equivalent.
     var output = Console.Out;

     // Heading line.
     output.WriteLine($@"{dataName}:");

     // One indented line per statistic.
     output.WriteLine($@"  Number of tweets: {crowdData.NumTweets}");
     output.WriteLine($@"  Number of tweets with gold labels: {crowdData.NumGoldTweets}");
     output.WriteLine($@"  Number of workers: {crowdData.NumWorkers}");
     output.WriteLine($@"  Number of worker labels: {crowdData.NumLabels}");

     // Blank line separating this summary from whatever is printed next.
     output.WriteLine();
 }
        /// <summary>
        /// Returns, for each worker index, the indices of the tweets that worker has labelled.
        /// </summary>
        /// <param name="data">The data whose crowd labels are scanned.</param>
        /// <returns>
        /// A jagged array with one row per worker index. Row <c>i</c> holds the tweet indices
        /// (looked up through <c>TweetIdToIndex</c>) of every label given by the worker
        /// <c>WorkerIndexToId[i]</c>, in the order those labels appear in
        /// <paramref name="data"/>. Workers with no labels get an empty row.
        /// </returns>
        public int[][] GetTweetIndicesPerWorkerIndex(CrowdData data)
        {
            var result = new int[this.WorkerCount][];

            // Nothing to fill in: do not touch the data at all in this case.
            if (result.Length == 0)
            {
                return result;
            }

            // Bucket the labels by worker in a single pass instead of rescanning every
            // label once per worker. ToLookup keeps the source order inside each bucket
            // and yields an empty sequence for a worker id that has no labels.
            var labelsByWorker = data.CrowdLabels.ToLookup(d => d.WorkerId);

            for (var i = 0; i < result.Length; i++)
            {
                result[i] = labelsByWorker[this.WorkerIndexToId[i]]
                            .Select(d => this.TweetIdToIndex[d.TweetId])
                            .ToArray();
            }

            return result;
        }
Exemple #3
0
        /// <summary>
        ///     Gets the worker label count histogram.
        /// </summary>
        /// <param name="data">
        ///     The data whose crowd labels are counted per worker.
        /// </param>
        /// <returns>
        ///     An array in which element <c>i</c> is the number of workers who gave exactly
        ///     <c>i</c> labels. Its length is one more than the largest per-worker label count,
        ///     and element 0 is always <c>null</c>.
        /// </returns>
        public static int?[] GetWorkerLabelCountHistogram(CrowdData data)
        {
            // Number of labels contributed by each distinct worker.
            var labelsPerWorker = new List<int>();
            foreach (var workerGroup in data.CrowdLabels.GroupBy(datum => datum.WorkerId))
            {
                labelsPerWorker.Add(workerGroup.Count());
            }

            var largestCount = 0;
            if (labelsPerWorker.Count != 0)
            {
                largestCount = labelsPerWorker.Max();
            }

            // Start every bucket at zero, then tally the workers into their bucket.
            var histogram = Util.ArrayInit<int?>(largestCount + 1, i => 0);
            foreach (var labelCount in labelsPerWorker)
            {
                histogram[labelCount] += 1;
            }

            // The zero bucket is reported as null rather than as a count.
            histogram[0] = null;

            return histogram;
        }
        /// <summary>
        /// Initializes a new instance of the <see cref="CrowdDataMapping" /> class.
        /// </summary>
        /// <param name="data">
        /// The crowd data from which the worker, tweet and label index mappings are built.
        /// </param>
        /// <param name="labelValueToString">
        /// The mapping from label values in file to label strings. Its keys define the
        /// set of label values that may appear in the crowd labels.
        /// </param>
        /// <exception cref="ApplicationException">
        /// Thrown when a crowd label has a value that is not a key of
        /// <paramref name="labelValueToString"/>.
        /// </exception>
        public CrowdDataMapping(
            CrowdData data,
            Dictionary<int, string> labelValueToString)
        {
            // Workers: the position of an id in the array is that worker's index.
            this.WorkerIndexToId = data.WorkerIds.ToArray();
            var workerPositions = new Dictionary<string, int>();
            var nextWorkerIndex = 0;
            foreach (var workerId in this.WorkerIndexToId)
            {
                workerPositions.Add(workerId, nextWorkerIndex);
                nextWorkerIndex++;
            }

            this.WorkerIdToIndex = workerPositions;

            // Tweets: same scheme as for the workers.
            this.TweetIds = data.TweetIds.ToArray();
            var tweetPositions = new Dictionary<string, int>();
            var nextTweetIndex = 0;
            foreach (var tweetId in this.TweetIds)
            {
                tweetPositions.Add(tweetId, nextTweetIndex);
                nextTweetIndex++;
            }

            this.TweetIdToIndex = tweetPositions;

            // Every label value seen in the data must be one of the declared values.
            var observedValues = data.CrowdLabels.Select(d => d.WorkerLabel).Distinct().ToArray();
            var declaredValues = new HashSet<int>(labelValueToString.Keys);
            if (observedValues.Any(value => !declaredValues.Contains(value)))
            {
                throw new ApplicationException("Unexpected labels found");
            }

            // Label indices follow the ascending order of the declared label values.
            this.LabelValueToString = labelValueToString;
            var sortedValues = declaredValues.ToArray();
            Array.Sort(sortedValues);
            this.LabelIndexToValue = sortedValues;

            var labelPositions = new Dictionary<int, int>();
            var nextLabelIndex = 0;
            foreach (var labelValue in this.LabelIndexToValue)
            {
                labelPositions.Add(labelValue, nextLabelIndex);
                nextLabelIndex++;
            }

            this.LabelValueToIndex = labelPositions;

            // Keep the full data, plus the subset of labels on tweets that have a gold label.
            this.Data = data;
            var withGold = new CrowdData();
            withGold.CrowdLabels = data.CrowdLabels
                                   .Where(d => data.GoldLabels.ContainsKey(d.TweetId))
                                   .ToList();
            withGold.GoldLabels = data.GoldLabels;
            this.DataWithGold   = withGold;

            // Average worker label accuracy: fraction of the gold-covered worker labels
            // that agree with the gold label of their tweet.
            var judgedLabels = this.DataWithGold.CrowdLabels;
            var gold         = this.DataWithGold.GoldLabels;
            var agreements   = judgedLabels.Count(datum => datum.WorkerLabel == gold[datum.TweetId]);
            var total        = judgedLabels.Count();

            this.AverageWorkerLabelAccuracy = (double)agreements / total;
        }
Exemple #5
0
        /// <summary>
        ///     Gets the prominent workers for a data set - i.e. those who have given many labels.
        /// </summary>
        /// <param name="data">
        ///     The data whose crowd labels are grouped by worker.
        /// </param>
        /// <param name="maxNumberWorkers">
        ///     The maximum number of worker ids to return.
        /// </param>
        /// <returns>
        ///     The ids of at most <paramref name="maxNumberWorkers"/> workers, chosen in
        ///     descending order of their <c>Metric.Count</c> value as computed by <c>GetMetrics</c>.
        /// </returns>
        private static HashSet<string> GetProminentWorkers(CrowdData data, int maxNumberWorkers = 20)
        {
            var mapping       = new CrowdDataMapping(data, LabelValuesToString);
            var countByWorker = new Dictionary<string, object>();

            foreach (var workerGroup in data.CrowdLabels.GroupBy(cd => cd.WorkerId))
            {
                // One label per (worker, tweet) pair, keyed by tweet id.
                var labelPerTweet = workerGroup
                                    .Distinct(CrowdData.WorkerTweetEqualityComparer.Instance)
                                    .ToDictionary(lab => lab.TweetId, lab => lab.WorkerLabel);

                var metricsForWorker = GetMetrics(mapping, labelPerTweet);
                countByWorker[workerGroup.Key] = metricsForWorker[Metric.Count];
            }

            // Rank by count, highest first, and keep only the leading ids.
            var busiestWorkerIds = countByWorker
                                   .OrderByDescending(entry => (int)entry.Value)
                                   .Take(maxNumberWorkers)
                                   .Select(entry => entry.Key);

            return new HashSet<string>(busiestWorkerIds);
        }
Exemple #6
0
        /// <summary>
        /// Loads the crowd labels, the gold labels and the tweet texts, and combines them.
        /// </summary>
        /// <param name="crowdLabelsFileName">
        /// The crowd labels file name, format: (tweet id, worker id, worker label).
        /// </param>
        /// <param name="goldLabelsFileName">
        /// The gold labels file name, format:  (tweet id, gold label).
        /// </param>
        /// <param name="textsFileName">
        /// The texts file name. Each line is split on tabs; the first field is used as
        /// the tweet id and the second field as the tweet text.
        /// </param>
        /// <param name="allowedLabels">
        /// The allowed labels, passed through to <c>CrowdData.LoadData</c>.
        /// </param>
        /// <returns>
        /// The crowd data, with texts restricted to tweets that have at least one crowd label.
        /// </returns>
        public static CrowdDataWithText LoadData(
            string crowdLabelsFileName,
            string goldLabelsFileName,
            string textsFileName,
            HashSet<int> allowedLabels)
        {
            var labelData = CrowdData.LoadData(crowdLabelsFileName, goldLabelsFileName, allowedLabels);

            // Read every (tweet id, text) pair from the tab-separated texts file.
            var allTexts = new Dictionary<string, string>();
            foreach (var line in File.ReadLines(textsFileName))
            {
                var fields = line.Split('\t');
                allTexts.Add(fields[0], fields[1]);
            }

            // Keep only the texts of tweets that actually occur in the crowd labels.
            var labelledTweetIds = new HashSet<string>(labelData.CrowdLabels.Select(cd => cd.TweetId));
            var textsForLabelledTweets = new Dictionary<string, string>();
            foreach (var entry in allTexts)
            {
                if (labelledTweetIds.Contains(entry.Key))
                {
                    textsForLabelledTweets.Add(entry.Key, entry.Value);
                }
            }

            var combined = new CrowdDataWithText();
            combined.CrowdLabels = labelData.CrowdLabels;
            combined.GoldLabels  = labelData.GoldLabels;
            combined.TweetTexts  = textsForLabelledTweets;

            // The tweet and worker views are derived from the combined data itself.
            combined.Tweets  = Tweet.FromCrowdData(combined);
            combined.Workers = Worker.FromCrowdData(combined);

            return combined;
        }
Exemple #7
0
        /// <summary>
        ///     Gets the worker metrics for a data set.
        /// </summary>
        /// <param name="data">
        ///     The data whose crowd labels are grouped by worker.
        /// </param>
        /// <param name="runner">
        ///     The runner. When not null, its data mapping and predictions are used;
        ///     otherwise a new mapping is built from <paramref name="data"/>.
        /// </param>
        /// <param name="maxNumberWorkers">
        ///     The maximum number of workers for which confusion matrices are kept.
        /// </param>
        /// <returns>
        ///     A dictionary keyed by metric name (count, accuracy, confusion matrix, and the
        ///     confusion matrix name suffixed with "Percentage"), each mapping worker id to
        ///     that worker's value. The two confusion matrix entries only contain the
        ///     workers with the highest counts.
        /// </returns>
        public static Dictionary<string, Dictionary<string, object>> GetWorkerMetrics(CrowdData data,
                                                                                      RunnerBase runner, int maxNumberWorkers = 20)
        {
            var perWorkerMetrics = new List<Metric>
            {
                Metric.Count,
                Metric.Accuracy,
                Metric.ConfusionMatrix
            };
            var result  = new Dictionary<string, Dictionary<string, object>>();
            var mapping = runner != null ? runner.DataMapping : new CrowdDataMapping(data, LabelValuesToString);

            // One (initially empty) per-worker table for every metric.
            foreach (var metric in perWorkerMetrics)
            {
                result[metric.ToString()] = new Dictionary<string, object>();
            }

            // The confusion matrix is additionally stored in percentage form.
            var confusionMatrixKey           = Metric.ConfusionMatrix.ToString();
            var confusionMatrixPercentageKey = $"{confusionMatrixKey}Percentage";
            result[confusionMatrixPercentageKey] = new Dictionary<string, object>();

            var labelsByWorker = data.CrowdLabels.GroupBy(cd => cd.WorkerId);
            var trueLabels     = runner?.Predictions;

            foreach (var workerGroup in labelsByWorker)
            {
                var workerId = workerGroup.Key;

                // One label per (worker, tweet) pair, keyed by tweet id.
                var labelPerTweet = workerGroup
                                    .Distinct(CrowdData.WorkerTweetEqualityComparer.Instance)
                                    .ToDictionary(lab => lab.TweetId, lab => lab.WorkerLabel);
                var workerMetrics = GetMetrics(mapping, labelPerTweet, trueLabels);

                foreach (var metric in perWorkerMetrics)
                {
                    if (metric != Metric.ConfusionMatrix)
                    {
                        // Plain metrics are stored exactly as computed.
                        result[metric.ToString()][workerId] = workerMetrics[metric];
                        continue;
                    }

                    // The confusion matrix is converted for plotting, once as-is and
                    // once with the extra flag set for the percentage variant.
                    var rawMatrix = (double[,])workerMetrics[metric];

                    var plotMatrix = PlotData.GetConfusionMatrix(
                        rawMatrix,
                        mapping,
                        PlotData.ConfusionMatrixRowLabel,
                        PlotData.ConfusionMatrixColLabel);
                    result[confusionMatrixKey][workerId] = plotMatrix;

                    var plotMatrixPercentage = PlotData.GetConfusionMatrix(
                        rawMatrix,
                        mapping,
                        PlotData.ConfusionMatrixRowLabel,
                        PlotData.ConfusionMatrixColLabel,
                        true);
                    result[confusionMatrixPercentageKey][workerId] = plotMatrixPercentage;
                }
            }

            // Limit the confusion matrices to the more prominent workers.
            var prominentWorkers = new HashSet<string>(
                result[Metric.Count.ToString()]
                .OrderByDescending(entry => (int)entry.Value)
                .Take(maxNumberWorkers)
                .Select(entry => entry.Key));

            foreach (var matrixKey in new[] { confusionMatrixKey, confusionMatrixPercentageKey })
            {
                result[matrixKey] = result[matrixKey]
                                    .Where(entry => prominentWorkers.Contains(entry.Key))
                                    .ToDictionary(entry => entry.Key, entry => entry.Value);
            }

            return result;
        }
Exemple #8
0
        /// <summary>
        /// Limits the data in various ways. The limits are applied in this order: tweets
        /// (including optional balancing), workers, judgments per worker, judgments per
        /// tweet, and finally the total number of judgments.
        /// </summary>
        /// <param name="maxJudgements">
        /// Maximum number of judgments.
        /// </param>
        /// <param name="maxNumTweets">
        /// The maximum number of tweets. If less than the full number, these are chosen randomly
        /// except that tweets with gold labels are chosen preferentially.
        /// </param>
        /// <param name="maxNumWorkers">
        /// The maximum number of workers. If less than the full number, these are chosen
        /// to maximize the number of labels
        /// </param>
        /// <param name="maxJudgmentsPerWorker">
        /// The maximum number of judgments per worker.
        /// </param>
        /// <param name="maxJudgmentsPerTweet">
        /// The maximum number of judgments per tweet.
        /// </param>
        /// <param name="balanceTweetsByLabel">
        /// Balance tweets by majority vote label: every label keeps at most as many tweets
        /// as the rarest label has. Selected tweets without any crowd label have no
        /// majority vote and are dropped.
        /// </param>
        /// <param name="randomSeed">
        /// The random seed.
        /// </param>
        /// <returns>
        /// The reduced crowd data.
        /// </returns>
        public virtual CrowdData LimitData(
            int maxJudgements         = int.MaxValue,
            int maxNumTweets          = int.MaxValue,
            int maxNumWorkers         = int.MaxValue,
            int maxJudgmentsPerWorker = int.MaxValue,
            int maxJudgmentsPerTweet  = int.MaxValue,
            bool balanceTweetsByLabel = false,
            int randomSeed            = 12347)
        {
            var crowdLabels = this.CrowdLabels;
            var goldLabels  = this.GoldLabels;

            // Restrict the tweets if requested.
            Rand.Restart(randomSeed);
            var selectedTweets = new HashSet<string>(crowdLabels.Select(cd => cd.TweetId).Distinct());

            if (selectedTweets.Count > maxNumTweets || balanceTweetsByLabel)
            {
                var goldTweets    = goldLabels.Keys.ToArray();
                var nonGoldTweets = selectedTweets.Except(goldTweets).ToArray();

                var goldPerm    = Rand.Perm(goldTweets.Length);
                var nonGoldPerm = Rand.Perm(nonGoldTweets.Length);

                // Shuffled gold tweets come first so that they are taken preferentially.
                var permutedGoldTweets    = goldPerm.Select(i => goldTweets[i]);
                var permutedNonGoldTweets = nonGoldPerm.Select(i => nonGoldTweets[i]);
                var selectedTweetList     = permutedGoldTweets.Concat(permutedNonGoldTweets).Take(maxNumTweets).ToList();
                selectedTweets = new HashSet<string>(selectedTweetList);
                crowdLabels    = crowdLabels.Where(cd => selectedTweets.Contains(cd.TweetId)).ToList();

                if (balanceTweetsByLabel)
                {
                    var majorityVoteLabels  = CrowdData.MajorityVoteLabels(crowdLabels);
                    var tweetCountsPerLabel = majorityVoteLabels.Values.GroupBy(lab => lab).Select(grp => grp.Count()).ToArray();

                    // Size of the rarest class; 0 when no tweet has a majority vote, so that
                    // Min() is never called on an empty sequence.
                    var smallestCount = tweetCountsPerLabel.Length == 0 ? 0 : tweetCountsPerLabel.Min();

                    // Keep the first smallestCount tweets of every label, in selection order.
                    // Grouping by the label itself works for any label values (they need not
                    // be 0..k-1), and tweets without a majority vote are skipped instead of
                    // causing a failed dictionary lookup.
                    var balancedTweets = selectedTweetList
                                         .Where(tweet => majorityVoteLabels.ContainsKey(tweet))
                                         .GroupBy(tweet => majorityVoteLabels[tweet])
                                         .SelectMany(grp => grp.Take(smallestCount));

                    selectedTweets.Clear();
                    selectedTweets.UnionWith(balancedTweets);

                    crowdLabels = crowdLabels.Where(cd => selectedTweets.Contains(cd.TweetId)).ToList();
                }

                goldLabels = goldLabels.Where(kvp => selectedTweets.Contains(kvp.Key))
                             .ToDictionary(kvp => kvp.Key, kvp => kvp.Value);
            }

            // Limit workers
            var numWorkers = crowdLabels.Select(cd => cd.WorkerId).Distinct().Count();

            if (numWorkers > maxNumWorkers)
            {
                // Keep the workers with the most labels on the selected tweets.
                crowdLabels = crowdLabels.GroupBy(cd => cd.WorkerId)
                              .OrderByDescending(wcds => wcds.Count(wcd => selectedTweets.Contains(wcd.TweetId)))
                              .Take(maxNumWorkers).SelectMany(cds => cds).ToList();
            }

            // Limit judgments per worker
            if (maxJudgmentsPerWorker < int.MaxValue)
            {
                crowdLabels = crowdLabels.GroupBy(cd => cd.WorkerId).Select(
                    gp =>
                {
                    // Random subset of this worker's judgments.
                    var judgments        = gp.ToArray();
                    var perm             = Rand.Perm(judgments.Length);
                    var limitCount       = Math.Min(perm.Length, maxJudgmentsPerWorker);
                    var limitedJudgments = Util.ArrayInit(limitCount, i => judgments[perm[i]]);
                    return limitedJudgments;
                }).SelectMany(cds => cds).ToList();
            }

            // Limit judgments per tweet
            if (maxJudgmentsPerTweet < int.MaxValue)
            {
                crowdLabels = crowdLabels.GroupBy(cd => cd.TweetId).Select(
                    gp =>
                {
                    // Random subset of this tweet's judgments.
                    var judgments        = gp.ToArray();
                    var perm             = Rand.Perm(judgments.Length);
                    var limitCount       = Math.Min(perm.Length, maxJudgmentsPerTweet);
                    var limitedJudgments = Util.ArrayInit(limitCount, i => judgments[perm[i]]);
                    return limitedJudgments;
                }).SelectMany(cds => cds).ToList();
            }

            // Limit the total judgments
            if (maxJudgements < int.MaxValue)
            {
                var numJudgments        = crowdLabels.Count;
                var perm                = Rand.Perm(numJudgments);
                var limitedNumJudgments = Math.Min(numJudgments, maxJudgements);
                crowdLabels = Util.ArrayInit(limitedNumJudgments, i => crowdLabels[perm[i]]).ToList();
            }

            return new CrowdData
            {
                CrowdLabels = crowdLabels,
                GoldLabels  = goldLabels
            };
        }