/// <summary>
/// Creates an active learning instance bound to a model instance and a results object.
/// </summary>
/// <param name="data">The data (not read by this constructor).</param>
/// <param name="model">The model instance; stored, and tested for being a CBCC model.</param>
/// <param name="results">The results instance, used for both the active learning and the batch results.</param>
/// <param name="numCommunities">The number of communities (only for CBCC; not read by this constructor).</param>
public ActiveLearning(IList<Datum> data, BCC model, Results results, int numCommunities)
{
    this.bcc = model;

    // The instance is a community model exactly when the supplied model is a CBCC.
    IsCommunityModel = model is CBCC;

    // Both result slots start out pointing at the same results object.
    ActiveLearningResults = results;
    BatchResults = results;

    isExperimentCompleted = false;

    // Start with an empty prediction data list.
    PredictionData = new List<Datum>();
}
/// <summary>
/// Runs the selected model on the complete data set, using the same data for inference and evaluation.
/// </summary>
/// <param name="dataSet">The dataset name.</param>
/// <param name="data">The data.</param>
/// <param name="runType">The model run type.</param>
/// <param name="model">The model instance (used by the BCC branch only).</param>
/// <param name="numCommunities">The number of communities (only for CBCC).</param>
/// <returns>The inference results.</returns>
public static Results RunGold(string dataSet, IList<Datum> data, RunType runType, BCC model, int numCommunities = 2)
{
    string modelName = Program.GetModelName(dataSet, runType);
    var goldResults = new Results();

    if (runType == RunType.VoteDistribution || runType == RunType.MajorityVote)
    {
        // Both voting baselines share one entry point; only the last flag differs.
        bool isVoteDistribution = runType == RunType.VoteDistribution;
        goldResults.RunMajorityVote(data, data, true, isVoteDistribution);
    }
    else if (runType == RunType.DawidSkene)
    {
        goldResults.RunDawidSkene(data, data, true);
    }
    else
    {
        // Every other run type is handled by the BCC runner.
        goldResults.RunBCC(modelName, data, data, model, RunMode.ClearResults, true, numCommunities, false, false);
    }

    return goldResults;
}
/// <summary>
/// Replaces the results object held by this active learning instance.
/// </summary>
/// <param name="results">The results object to store.</param>
public void UpdateActiveLearningResults(Results results)
{
    // Plain assignment: no copy is made of the supplied object.
    ActiveLearningResults = results;
}
/// <summary>
/// Writes the per-round accuracy and average recall to a csv file.
/// The file name is built as
/// <c>{resultsDir}{modelName}__graph_{suffix}_InitialLabels_{projectInitialNumLabelsPerTask}.csv</c>,
/// where a suffix of "final" is replaced by the empty string. Note that
/// <paramref name="resultsDir"/> is prepended verbatim, so it must already end with a
/// directory separator for the file to land inside that directory.
/// </summary>
/// <param name="accuracy">The list of accuracies evaluated on the gold labels at each active learning round.</param>
/// <param name="avgRecall">The list of average recalls evaluated on the gold labels at each active learning round.</param>
/// <param name="taskValue">The list of utilities of the task selected at each active learning round (not written by this method).</param>
/// <param name="results">The result instance (not written by this method).</param>
/// <param name="modelName">The model name.</param>
/// <param name="suffix">The suffix of the csv file; "final" maps to no suffix.</param>
/// <param name="resultsDir">The directory prefix of the csv file.</param>
/// <param name="projectInitialNumLabelsPerTask">The initial number of exploratory labels for each task.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="accuracy"/> or <paramref name="avgRecall"/> is null.
/// </exception>
public static void DoSnapshot(List<double> accuracy, List<double> avgRecall, List<ActiveLearningResult> taskValue, Results results, string modelName, string suffix, string resultsDir, int projectInitialNumLabelsPerTask)
{
    // Validate before opening the file so a bad call does not leave an empty csv behind.
    if (accuracy == null)
    {
        throw new ArgumentNullException("accuracy");
    }

    if (avgRecall == null)
    {
        throw new ArgumentNullException("avgRecall");
    }

    suffix = suffix == "final" ? "" : suffix;
    string graphCsvFileName = String.Format("{2}{0}__graph_{1}_InitialLabels_{3}.csv", modelName, suffix, resultsDir, projectInitialNumLabelsPerTask);

    // Only rounds that have both metrics can be written as a complete row; indexing
    // avgRecall by the accuracy count would throw when the lists differ in length.
    int rowCount = Math.Min(accuracy.Count, avgRecall.Count);

    // Numbers must use '.' as the decimal separator regardless of the machine locale,
    // otherwise a comma-decimal culture produces rows that no longer parse as two columns.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    using (StreamWriter writer = new StreamWriter(graphCsvFileName))
    {
        writer.WriteLine("Accuracy,AvgRecall");
        for (int i = 0; i < rowCount; i++)
        {
            // Edit this format string to change how each row is printed.
            writer.WriteLine(String.Format(invariant, "{0:0.0000},{1:0.0000}", accuracy[i], avgRecall[i]));
        }
    }
}
/// <summary>
/// Runs the standard active learning procedure on a model instance and an input data set.
/// Each round runs the selected model on the labels gathered so far, scores the remaining
/// (task, worker) pairs, asks <c>GetNextData</c> for the next labels and appends them to the
/// active set. The loop stops when no further labels are returned. The per-round accuracy and
/// average recall are printed to the console and written to a csv file at the end.
/// This method also writes the static members <c>accuracy</c>, <c>taskValueList</c> and
/// <c>isExperimentCompleted</c>, and calls <c>ResetAccuracyList</c> once the csv is written.
/// </summary>
/// <param name="data">The data.</param>
/// <param name="modelName">The model name.</param>
/// <param name="runType">The model run type.</param>
/// <param name="model">The model instance.</param>
/// <param name="taskSelectionMethod">The method for selecting tasks (Entropy / Random / Uniform; anything else falls back to Entropy).</param>
/// <param name="workerSelectionMethod">The method for selecting workers (Best / Random). Best is replaced by Random when the results hold no worker confusion matrix.</param>
/// <param name="resultsDir">The directory to save the log files.</param>
/// <param name="communityCount">The number of communities (only for CBCC).</param>
/// <param name="initialNumLabelsPerTask">The initial number of exploratory labels that are randomly selected for each task.</param>
/// <param name="numIncremData">The number of data points to add at each round.</param>
/// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
/// <exception cref="ArgumentException">
/// The number of labels left after the initial selection is not positive.
/// </exception>
/// <exception cref="ApplicationException">The worker selection method is not recognised.</exception>
public static void RunActiveLearning(
    IList<Datum> data,
    string modelName,
    RunType runType,
    BCC model,
    TaskSelectionMethod taskSelectionMethod,
    WorkerSelectionMethod workerSelectionMethod,
    string resultsDir,
    int communityCount = -1,
    int initialNumLabelsPerTask = 1,
    int numIncremData = 1)
{
    if (data == null)
    {
        throw new ArgumentNullException("data");
    }

    // Count elapsed time
    Stopwatch stopWatchTotal = Stopwatch.StartNew();

    // Dictionary keyed by task Id, with randomly ordered labelings
    var groupedRandomisedData = data.GroupBy(d => d.TaskId)
        .Select(g =>
        {
            var arr = g.ToArray();
            int cnt = arr.Length;
            var perm = Rand.Perm(cnt);
            return new
            {
                key = g.Key,
                arr = g.Select((t, i) => arr[perm[i]]).ToArray()
            };
        })
        .ToDictionary(a => a.key, a => a.arr);

    // Dictionary keyed by task Id, with label counts
    Dictionary<string, int> totalCounts = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => kvp.Value.Length);

    // Keyed by task, value is a HashSet containing all the workers that labelled the task.
    // NOTE(review): entries are presumably removed by GetSubdata / GetNextData as labels are consumed - confirm.
    Dictionary<string, HashSet<string>> remainingWorkersPerTask = groupedRandomisedData.ToDictionary(
        kvp => kvp.Key,
        kvp => new HashSet<string>(kvp.Value.Select(dat => dat.WorkerId)));

    int numTaskIds = totalCounts.Count;
    int totalInstances = initialNumLabelsPerTask > 0
        ? data.Count - initialNumLabelsPerTask * numTaskIds
        : data.Count - numIncremData;

    // Nothing would be left to select after the initial labels, so refuse to start.
    // ArgumentException derives from Exception, so existing catch blocks keep working.
    if (totalInstances <= 0)
    {
        throw new ArgumentException("The variable 'totalInstances' should be greater than zero");
    }

    // Only create the accuracy list when it is null (for GUI use)
    if (accuracy == null)
    {
        accuracy = new List<double>();
    }

    List<double> avgRecall = new List<double>();
    taskValueList = new List<ActiveLearningResult>();
    int index = 0;

    Console.WriteLine("Active Learning: {0}", modelName);
    Console.WriteLine("\t\t\t\t\t\tAcc\tAvgRec");

    // Get initial data
    Results results = new Results();
    Dictionary<string, int> currentCounts = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => initialNumLabelsPerTask);
    List<Datum> subData = GetSubdata(groupedRandomisedData, currentCounts, remainingWorkersPerTask);
    List<Datum> nextData = null;
    ActiveLearning activeLearning = null;
    isExperimentCompleted = false;

    // Accuracy is evaluated on every round.
    bool calculateAccuracy = true;

    // Run until the data run out
    while (true)
    {
        if (subData != null || nextData != null)
        {
            switch (runType)
            {
                case RunType.VoteDistribution:
                    results.RunMajorityVote(subData, data, calculateAccuracy, true);
                    break;
                case RunType.MajorityVote:
                    results.RunMajorityVote(subData, data, calculateAccuracy, false);
                    break;
                case RunType.DawidSkene:
                    results.RunDawidSkene(subData, data, calculateAccuracy);
                    break;
                default:
                    // Run BCC models
                    results.RunBCC(modelName, subData, data, model, RunMode.ClearResults, calculateAccuracy, communityCount, false);
                    break;
            }
        }

        if (activeLearning == null)
        {
            activeLearning = new ActiveLearning(data, model, results, communityCount);
        }
        else
        {
            activeLearning.UpdateActiveLearningResults(results);
        }

        // We create a list of task utilities
        // TaskUtility: Dictionary keyed by task, the value is an active learning result.
        Dictionary<string, ActiveLearningResult> TaskUtility = null;
        switch (taskSelectionMethod)
        {
            case TaskSelectionMethod.EntropyTask:
                TaskUtility = activeLearning.EntropyTrueLabel();
                break;
            case TaskSelectionMethod.RandomTask:
                TaskUtility = data.GroupBy(d => d.TaskId).ToDictionary(a => a.Key, a => new ActiveLearningResult
                {
                    TaskValue = Rand.Double()
                });
                break;
            case TaskSelectionMethod.UniformTask:
                // Reproduce uniform task selection by picking the task with the lowest number of current labels.
                // That is, the utility is minus the current count.
                TaskUtility = currentCounts.OrderBy(kvp => kvp.Value).ToDictionary(a => a.Key, a => new ActiveLearningResult
                {
                    TaskId = a.Key,
                    TaskValue = -a.Value
                });
                break;
            default:
                TaskUtility = activeLearning.EntropyTrueLabel();
                break;
        }

        // We create a list of worker utilities.
        Dictionary<string, double> WorkerAccuracy = null;

        // Best worker selection is only allowed for methods that infer worker confusion matrices.
        if (results.WorkerConfusionMatrix == null)
        {
            workerSelectionMethod = WorkerSelectionMethod.RandomWorker;
        }

        switch (workerSelectionMethod)
        {
            case WorkerSelectionMethod.BestWorker:
                // Assign worker accuracies to the maximum value on the diagonal of the confusion matrix (conservative approach).
                // Alternative ways are also possible.
                WorkerAccuracy = results.WorkerConfusionMatrixMean.ToDictionary(
                    kvp => kvp.Key,
                    kvp => Results.GetConfusionMatrixDiagonal(kvp.Value).Max());
                break;
            case WorkerSelectionMethod.RandomWorker:
                // Assign worker accuracies to random values
                WorkerAccuracy = results.FullMapping.WorkerIdToIndex.ToDictionary(kvp => kvp.Key, kvp => Rand.Double());
                break;
            default:
                throw new ApplicationException("No worker selection method selected");
        }

        // Create a list of tuples (TaskId, WorkerId, ActiveLearningResult),
        // one for every worker still available on every scored task.
        List<Tuple<string, string, ActiveLearningResult>> LabelValue = new List<Tuple<string, string, ActiveLearningResult>>();
        foreach (var kvp in TaskUtility)
        {
            foreach (var workerId in remainingWorkersPerTask[kvp.Key])
            {
                var labelValue = new ActiveLearningResult
                {
                    WorkerId = workerId,
                    TaskId = kvp.Key,
                    TaskValue = kvp.Value.TaskValue,
                    WorkerValue = WorkerAccuracy[workerId]
                };
                LabelValue.Add(Tuple.Create(labelValue.TaskId, labelValue.WorkerId, labelValue));
            }
        }

        // Increment the active set with new data
        nextData = GetNextData(groupedRandomisedData, LabelValue, currentCounts, totalCounts, remainingWorkersPerTask, numIncremData);

        if (nextData == null || nextData.Count == 0)
        {
            break;
        }

        index += nextData.Count;
        subData.AddRange(nextData);

        // Logs
        if (calculateAccuracy)
        {
            accuracy.Add(results.Accuracy);
            avgRecall.Add(results.AvgRecall);

            // Record the utility entry of the first newly added label, stamped with its worker and task.
            // TaskUtility cannot be null here: the loop over it above would already have thrown.
            ActiveLearningResult nextTaskValueItem = TaskUtility[nextData.First().TaskId];
            nextTaskValueItem.WorkerId = nextData.First().WorkerId;
            nextTaskValueItem.TaskId = nextData.First().TaskId;
            taskValueList.Add(nextTaskValueItem);

            Console.WriteLine("{0} (label {1} of {2}):\t{3:0.000}\t{4:0.0000}", modelName, index, totalInstances, accuracy.Last(), avgRecall.Last());
        }
    }

    isExperimentCompleted = true;
    stopWatchTotal.Stop();

    DoSnapshot(accuracy, avgRecall, taskValueList, results, modelName, "final", resultsDir, initialNumLabelsPerTask);
    ResetAccuracyList();

    Console.WriteLine("Elapsed time: {0}\n", stopWatchTotal.Elapsed);
}