/// <summary>
/// Prints a short summary of a crowd data set to the console: a heading line,
/// four count lines and a trailing blank line.
/// </summary>
/// <param name="crowdData">
/// The crowd data whose counts are reported.
/// </param>
/// <param name="dataName">
/// The name printed (followed by a colon) as the heading.
/// </param>
public static void WriteCrowdDataDetailsToConsole(CrowdData crowdData, string dataName)
{
    // Console.WriteLine forwards to Console.Out, so writing through the writer is equivalent.
    var output = Console.Out;

    output.WriteLine($"{dataName}:");
    output.WriteLine($" Number of tweets: {crowdData.NumTweets}");
    output.WriteLine($" Number of tweets with gold labels: {crowdData.NumGoldTweets}");
    output.WriteLine($" Number of workers: {crowdData.NumWorkers}");
    output.WriteLine($" Number of worker labels: {crowdData.NumLabels}");
    output.WriteLine();
}
/// <summary>
/// Returns, for each worker index (rows), the indices of the tweets that worker labelled (columns).
/// </summary>
/// <param name="data">The crowd data whose labels are grouped by worker.</param>
/// <returns>
/// A jagged array with one row per worker index. Row <c>i</c> holds the tweet indices, in the order
/// the labels appear in <paramref name="data"/>, for the worker <c>WorkerIndexToId[i]</c>.
/// A worker without labels in <paramref name="data"/> gets an empty row.
/// </returns>
public int[][] GetTweetIndicesPerWorkerIndex(CrowdData data)
{
    // Group the labels by worker once, rather than rescanning every label for every worker.
    // ToLookup keeps the source order within each group and yields an empty sequence for unknown keys.
    var labelsByWorker = data.CrowdLabels.ToLookup(d => d.WorkerId);

    var result = new int[this.WorkerCount][];
    for (var i = 0; i < this.WorkerCount; i++)
    {
        var workerId = this.WorkerIndexToId[i];
        result[i] = labelsByWorker[workerId]
            .Select(d => this.TweetIdToIndex[d.TweetId])
            .ToArray();
    }

    return result;
}
/// <summary>
/// Gets the worker label count histogram.
/// </summary>
/// <param name="data">
/// The crowd data whose labels are counted per worker.
/// </param>
/// <returns>
/// An array in which entry <c>k</c> is the number of workers who gave exactly <c>k</c> labels.
/// Entry 0 is always <c>null</c>; the array length is the largest per-worker count plus one.
/// </returns>
public static int?[] GetWorkerLabelCountHistogram(CrowdData data)
{
    var countsPerWorker = data.CrowdLabels
        .GroupBy(datum => datum.WorkerId)
        .Select(grp => grp.Count())
        .ToList();

    var largestCount = 0;
    if (countsPerWorker.Count != 0)
    {
        largestCount = countsPerWorker.Max();
    }

    // Start every bin at zero so that incrementing a nullable entry counts up.
    var histogram = new int?[largestCount + 1];
    for (var bin = 0; bin < histogram.Length; bin++)
    {
        histogram[bin] = 0;
    }

    foreach (var labelCount in countsPerWorker)
    {
        histogram[labelCount]++;
    }

    // The zero bin is blanked out rather than reported as 0.
    histogram[0] = null;
    return histogram;
}
/// <summary>
/// Initializes a new instance of the <see cref="CrowdDataMapping" /> class, building the
/// id/index lookups for workers, tweets and labels and the average worker accuracy against gold.
/// </summary>
/// <param name="data">
/// The crowd data.
/// </param>
/// <param name="labelValueToString">
/// The mapping from label values in file to label strings. Its keys define the set of allowed label values.
/// </param>
/// <exception cref="ApplicationException">
/// Thrown when the crowd labels contain a label value that is not a key of <paramref name="labelValueToString"/>.
/// </exception>
public CrowdDataMapping(CrowdData data, Dictionary<int, string> labelValueToString)
{
    // Workers: index -> id array and its inverse.
    var workerIds = data.WorkerIds.ToArray();
    this.WorkerIndexToId = workerIds;
    var workerIndexById = new Dictionary<string, int>();
    for (var index = 0; index < workerIds.Length; index++)
    {
        workerIndexById.Add(workerIds[index], index);
    }

    this.WorkerIdToIndex = workerIndexById;

    // Tweets: index -> id array and its inverse.
    var tweetIds = data.TweetIds.ToArray();
    this.TweetIds = tweetIds;
    var tweetIndexById = new Dictionary<string, int>();
    for (var index = 0; index < tweetIds.Length; index++)
    {
        tweetIndexById.Add(tweetIds[index], index);
    }

    this.TweetIdToIndex = tweetIndexById;

    // Every label value seen in the data must be one of the declared values.
    var observedValues = data.CrowdLabels.Select(d => d.WorkerLabel).Distinct().ToArray();
    var allowedValues = new HashSet<int>(labelValueToString.Keys);
    if (observedValues.Any(value => !allowedValues.Contains(value)))
    {
        throw new ApplicationException("Unexpected labels found");
    }

    this.LabelValueToString = labelValueToString;

    // Label indices follow the ascending order of the label values.
    var sortedValues = allowedValues.ToArray();
    Array.Sort(sortedValues);
    this.LabelIndexToValue = sortedValues;
    var labelIndexByValue = new Dictionary<int, int>();
    for (var index = 0; index < sortedValues.Length; index++)
    {
        labelIndexByValue.Add(sortedValues[index], index);
    }

    this.LabelValueToIndex = labelIndexByValue;

    this.Data = data;

    // Keep only the crowd labels whose tweet has a gold label.
    var allGold = data.GoldLabels;
    this.DataWithGold = new CrowdData
    {
        CrowdLabels = data.CrowdLabels.Where(d => allGold.ContainsKey(d.TweetId)).ToList(),
        GoldLabels = allGold
    };

    // WorkerLabelAccuracy: fraction of worker labels that agree with the gold label.
    var labelledWithGold = this.DataWithGold.CrowdLabels;
    var gold = this.DataWithGold.GoldLabels;
    var total = labelledWithGold.Count();
    var agreeing = labelledWithGold.Count(datum => datum.WorkerLabel == gold[datum.TweetId]);
    this.AverageWorkerLabelAccuracy = (double)agreeing / total;
}
/// <summary>
/// Gets the prominent workers for a data set - i.e. those with the highest
/// <c>Metric.Count</c> value as reported by <c>GetMetrics</c>.
/// </summary>
/// <param name="data">
/// The data.
/// </param>
/// <param name="maxNumberWorkers">
/// The maximum number of worker ids to return.
/// </param>
/// <returns>
/// The ids of at most <paramref name="maxNumberWorkers"/> workers, taken in descending order of count.
/// </returns>
private static HashSet<string> GetProminentWorkers(CrowdData data, int maxNumberWorkers = 20)
{
    var mapping = new CrowdDataMapping(data, LabelValuesToString);
    var countByWorker = new Dictionary<string, object>();

    foreach (var workerGroup in data.CrowdLabels.GroupBy(cd => cd.WorkerId))
    {
        // One label per (worker, tweet) pair, keyed by tweet id.
        var labelByTweet = workerGroup
            .Distinct(CrowdData.WorkerTweetEqualityComparer.Instance)
            .ToDictionary(lab => lab.TweetId, lab => lab.WorkerLabel);

        countByWorker[workerGroup.Key] = GetMetrics(mapping, labelByTweet)[Metric.Count];
    }

    var rankedWorkerIds =
        from entry in countByWorker
        orderby (int)entry.Value descending
        select entry.Key;

    return new HashSet<string>(rankedWorkerIds.Take(maxNumberWorkers));
}
/// <summary>
/// Loads the crowd labels, gold labels and tweet texts, and builds the tweet and worker collections.
/// </summary>
/// <param name="crowdLabelsFileName">
/// The crowd labels file name, format: (tweet id, worker id, worker label).
/// </param>
/// <param name="goldLabelsFileName">
/// The gold labels file name, format: (tweet id, gold label).
/// </param>
/// <param name="textsFileName">
/// The texts file name. Each non-empty line is split on tabs; the first field is used as the
/// tweet id and the second as the tweet text. Empty lines are ignored.
/// </param>
/// <param name="allowedLabels">
/// The allowed labels.
/// </param>
/// <returns>
/// The crowd data, with texts restricted to tweets that have at least one crowd label.
/// </returns>
public static CrowdDataWithText LoadData(
    string crowdLabelsFileName,
    string goldLabelsFileName,
    string textsFileName,
    HashSet<int> allowedLabels)
{
    var crowdData = CrowdData.LoadData(crowdLabelsFileName, goldLabelsFileName, allowedLabels);

    // An empty line splits into a single field, so reading field 1 would throw;
    // such lines carry no record and are skipped.
    var texts = File.ReadLines(textsFileName)
        .Where(line => line.Length > 0)
        .Select(line => line.Split('\t'))
        .ToDictionary(fields => fields[0], fields => fields[1]);

    // The HashSet already removes duplicate tweet ids.
    var tweetSet = new HashSet<string>(crowdData.CrowdLabels.Select(cd => cd.TweetId));

    var tweetsForData = texts
        .Where(kvp => tweetSet.Contains(kvp.Key))
        .ToDictionary(kvp => kvp.Key, kvp => kvp.Value);

    var result = new CrowdDataWithText
    {
        CrowdLabels = crowdData.CrowdLabels,
        GoldLabels = crowdData.GoldLabels,
        TweetTexts = tweetsForData
    };

    // These are built from the result itself, so they are set after the initializer.
    result.Tweets = Tweet.FromCrowdData(result);
    result.Workers = Worker.FromCrowdData(result);

    return result;
}
/// <summary>
/// Gets the per-worker metrics (count, accuracy and confusion matrices) for a data set.
/// </summary>
/// <param name="data">
/// The data.
/// </param>
/// <param name="runner">
/// The runner. When non-null, its data mapping and predictions are used; otherwise a new
/// mapping is built from <paramref name="data"/> and no predictions are passed on.
/// </param>
/// <param name="maxNumberWorkers">
/// The maximum number of workers for which the confusion matrices are kept.
/// </param>
/// <returns>
/// The metrics, keyed first by metric name and then by worker id. Besides one entry per metric
/// there is an extra "&lt;ConfusionMatrix name&gt;Percentage" entry; both confusion-matrix entries
/// are limited to the workers with the highest counts.
/// </returns>
public static Dictionary<string, Dictionary<string, object>> GetWorkerMetrics(
    CrowdData data,
    RunnerBase runner,
    int maxNumberWorkers = 20)
{
    var metricsForWorkers = new List<Metric> { Metric.Count, Metric.Accuracy, Metric.ConfusionMatrix };
    var result = new Dictionary<string, Dictionary<string, object>>();
    var mapping = runner == null ? new CrowdDataMapping(data, LabelValuesToString) : runner.DataMapping;

    // One (initially empty) per-worker table for each metric, plus one for the percentage matrices.
    metricsForWorkers.ForEach(metric => result[metric.ToString()] = new Dictionary<string, object>());
    var confusionMatrixKey = Metric.ConfusionMatrix.ToString();
    var confusionMatrixPercentageKey = $"{confusionMatrixKey}Percentage";
    result[confusionMatrixPercentageKey] = new Dictionary<string, object>();

    var trueLabels = runner?.Predictions;

    foreach (var workerGroup in data.CrowdLabels.GroupBy(cd => cd.WorkerId))
    {
        var workerId = workerGroup.Key;

        // One label per (worker, tweet) pair, keyed by tweet id.
        var labelByTweet = workerGroup
            .Distinct(CrowdData.WorkerTweetEqualityComparer.Instance)
            .ToDictionary(lab => lab.TweetId, lab => lab.WorkerLabel);
        var workerMetrics = GetMetrics(mapping, labelByTweet, trueLabels);

        foreach (var metric in metricsForWorkers)
        {
            if (metric != Metric.ConfusionMatrix)
            {
                result[metric.ToString()][workerId] = workerMetrics[metric];
                continue;
            }

            // The confusion matrix is stored twice: as-is and with the percentage flag set.
            var rawMatrix = (double[,])workerMetrics[metric];
            var plainMatrix = PlotData.GetConfusionMatrix(
                rawMatrix,
                mapping,
                PlotData.ConfusionMatrixRowLabel,
                PlotData.ConfusionMatrixColLabel);
            result[confusionMatrixKey][workerId] = plainMatrix;

            var percentageMatrix = PlotData.GetConfusionMatrix(
                rawMatrix,
                mapping,
                PlotData.ConfusionMatrixRowLabel,
                PlotData.ConfusionMatrixColLabel,
                true);
            result[confusionMatrixPercentageKey][workerId] = percentageMatrix;
        }
    }

    // Limit the confusion matrices to the more prominent workers.
    var rankedWorkerIds = result[Metric.Count.ToString()]
        .OrderByDescending(entry => (int)entry.Value)
        .Take(maxNumberWorkers)
        .Select(entry => entry.Key);
    var prominentWorkers = new HashSet<string>(rankedWorkerIds);

    foreach (var key in new[] { confusionMatrixKey, confusionMatrixPercentageKey })
    {
        result[key] = result[key]
            .Where(entry => prominentWorkers.Contains(entry.Key))
            .ToDictionary(entry => entry.Key, entry => entry.Value);
    }

    return result;
}
/// <summary>
/// Limits the data in various ways. The limits are applied in this order: tweets (and optional
/// balancing), workers, judgments per worker, judgments per tweet, total judgments.
/// The random generator is restarted with <paramref name="randomSeed"/> before any selection.
/// </summary>
/// <param name="maxJudgements">
/// Maximum number of judgments.
/// </param>
/// <param name="maxNumTweets">
/// The maximum number of tweets. If less than the full number, these are chosen randomly
/// except that tweets with gold labels are chosen preferentially.
/// </param>
/// <param name="maxNumWorkers">
/// The maximum number of workers. If less than the full number, these are chosen
/// to maximize the number of labels.
/// </param>
/// <param name="maxJudgmentsPerWorker">
/// The maximum number of judgments per worker.
/// </param>
/// <param name="maxJudgmentsPerTweet">
/// The maximum number of judgments per tweet.
/// </param>
/// <param name="balanceTweetsByLabel">
/// Balance tweets by majority vote label: each label keeps at most as many tweets as the
/// least frequent label has.
/// </param>
/// <param name="randomSeed">
/// The random seed.
/// </param>
/// <returns>
/// The reduced crowd data. Gold labels are only reduced by the tweet restriction step,
/// not by the worker or judgment limits.
/// </returns>
public virtual CrowdData LimitData(
    int maxJudgements = int.MaxValue,
    int maxNumTweets = int.MaxValue,
    int maxNumWorkers = int.MaxValue,
    int maxJudgmentsPerWorker = int.MaxValue,
    int maxJudgmentsPerTweet = int.MaxValue,
    bool balanceTweetsByLabel = false,
    int randomSeed = 12347)
{
    var crowdLabels = this.CrowdLabels;
    var goldLabels = this.GoldLabels;

    // Restrict the tweets if requested.
    Rand.Restart(randomSeed);
    var selectedTweets = new HashSet<string>(crowdLabels.Select(cd => cd.TweetId));
    if (selectedTweets.Count > maxNumTweets || balanceTweetsByLabel)
    {
        // Only gold tweets that actually have crowd labels are candidates. A gold-only tweet
        // would waste a slot and has no entry to look up when balancing by label.
        var goldTweets = goldLabels.Keys.Where(id => selectedTweets.Contains(id)).ToArray();
        var nonGoldTweets = selectedTweets.Except(goldTweets).ToArray();

        // Gold tweets come first so that they are preferred when the list is truncated.
        var goldPerm = Rand.Perm(goldTweets.Length);
        var nonGoldPerm = Rand.Perm(nonGoldTweets.Length);
        var permutedGoldTweets = goldPerm.Select(i => goldTweets[i]);
        var permutedNonGoldTweets = nonGoldPerm.Select(i => nonGoldTweets[i]);
        var selectedTweetList = permutedGoldTweets.Concat(permutedNonGoldTweets).Take(maxNumTweets).ToList();

        selectedTweets = new HashSet<string>(selectedTweetList);
        crowdLabels = crowdLabels.Where(cd => selectedTweets.Contains(cd.TweetId)).ToList();

        if (balanceTweetsByLabel)
        {
            var majorityVoteLabels = CrowdData.MajorityVoteLabels(crowdLabels);
            var labelGroups = majorityVoteLabels.Values.GroupBy(lab => lab).ToList();
            var smallestCount = labelGroups.Count == 0 ? 0 : labelGroups.Min(grp => grp.Count());

            // Counts are keyed by label value: label values need not be the contiguous
            // range 0..k-1, so they cannot safely be used as array indices.
            var counts = labelGroups.ToDictionary(grp => grp.Key, grp => 0);

            selectedTweets.Clear();
            foreach (var tweet in selectedTweetList)
            {
                var lab = majorityVoteLabels[tweet];
                var seenForLabel = counts[lab] + 1;
                counts[lab] = seenForLabel;
                if (seenForLabel <= smallestCount)
                {
                    selectedTweets.Add(tweet);
                }
            }

            crowdLabels = crowdLabels.Where(cd => selectedTweets.Contains(cd.TweetId)).ToList();
        }

        goldLabels = goldLabels
            .Where(kvp => selectedTweets.Contains(kvp.Key))
            .ToDictionary(kvp => kvp.Key, kvp => kvp.Value);
    }

    // Limit workers
    var numWorkers = crowdLabels.Select(cd => cd.WorkerId).Distinct().Count();
    if (numWorkers > maxNumWorkers)
    {
        // At this point every remaining label belongs to a selected tweet,
        // so a worker's group size is its number of relevant labels.
        crowdLabels = crowdLabels
            .GroupBy(cd => cd.WorkerId)
            .OrderByDescending(workerLabels => workerLabels.Count())
            .Take(maxNumWorkers)
            .SelectMany(workerLabels => workerLabels)
            .ToList();
    }

    // Limit judgments per worker
    if (maxJudgmentsPerWorker < int.MaxValue)
    {
        crowdLabels = crowdLabels
            .GroupBy(cd => cd.WorkerId)
            .SelectMany(gp => TakeRandomSubset(gp, maxJudgmentsPerWorker))
            .ToList();
    }

    // Limit judgments per tweet
    if (maxJudgmentsPerTweet < int.MaxValue)
    {
        crowdLabels = crowdLabels
            .GroupBy(cd => cd.TweetId)
            .SelectMany(gp => TakeRandomSubset(gp, maxJudgmentsPerTweet))
            .ToList();
    }

    // Limit the total judgments
    if (maxJudgements < int.MaxValue)
    {
        crowdLabels = TakeRandomSubset(crowdLabels, maxJudgements).ToList();
    }

    return new CrowdData { CrowdLabels = crowdLabels, GoldLabels = goldLabels };
}

/// <summary>
/// Returns the first <paramref name="maxCount"/> items (or all, if fewer) of a random permutation of the items.
/// </summary>
/// <typeparam name="T">The item type.</typeparam>
/// <param name="items">The items to choose from.</param>
/// <param name="maxCount">The maximum number of items to return.</param>
/// <returns>The chosen items, in permuted order.</returns>
private static T[] TakeRandomSubset<T>(IEnumerable<T> items, int maxCount)
{
    var pool = items.ToArray();
    var perm = Rand.Perm(pool.Length);
    var limitCount = Math.Min(pool.Length, maxCount);
    return Util.ArrayInit(limitCount, i => pool[perm[i]]);
}