/// <summary>
/// Builds the identifier of an experiment run by concatenating its settings.
/// </summary>
/// <param name="dataset">Name of the data set; first component of the identifier.</param>
/// <param name="runType">Model run type; its enum member name is the second component.</param>
/// <param name="taskSelectionMetric">Task selection method; its enum member name is the third component.</param>
/// <param name="workerSelectionMetric">Worker selection method; its enum member name is the fourth component.</param>
/// <param name="online">When true, the literal "Online" is appended directly after the worker selection name.</param>
/// <param name="taskSamples">Appended as "_T{n}" only when greater than zero.</param>
/// <param name="workerSamples">Appended as "_W{n}" only when greater than zero.</param>
/// <param name="numCommunities">Appended as "_Comm{n}" only when greater than zero.</param>
/// <returns>The assembled model name.</returns>
private static string GetModelName(string dataset, RunType runType, TaskSelectionMethod taskSelectionMetric, WorkerSelectionMethod workerSelectionMetric, bool online, int taskSamples = -1, int workerSamples = -1, int numCommunities = -1)
{
    string runTypeName = Enum.GetName(typeof(RunType), runType);
    string taskMethodName = Enum.GetName(typeof(TaskSelectionMethod), taskSelectionMetric);
    string workerMethodName = Enum.GetName(typeof(WorkerSelectionMethod), workerSelectionMetric);

    // Mandatory part: dataset_runType_taskMethod_workerMethod
    string name = dataset + "_" + runTypeName + "_" + taskMethodName + "_" + workerMethodName;

    // Optional suffixes, each emitted only when its setting is active.
    if (online)
    {
        name += "Online";
    }

    if (taskSamples > 0)
    {
        name += "_T" + taskSamples.ToString();
    }

    if (workerSamples > 0)
    {
        name += "_W" + workerSamples.ToString();
    }

    if (numCommunities > 0)
    {
        name += "_Comm" + numCommunities.ToString();
    }

    return name;
}
/// <summary>
/// Runs the active learning experiment presented in Venanzi et al. (WWW14) on a single data set,
/// using random worker selection.
/// </summary>
/// <param name="dataSet">Name of the data set; the labels are loaded from the file "&lt;dataSet&gt;.csv" inside the "Data" directory.</param>
/// <param name="runType">The model run type.</param>
/// <param name="taskSelectionMethod">The method for selecting tasks (Random / Entropy).</param>
/// <param name="model">The model instance.</param>
/// <param name="communityCount">The number of communities (only for CBCC).</param>
static void RunWWWActiveLearning(string dataSet, RunType runType, TaskSelectionMethod taskSelectionMethod, BCC model, int communityCount = 4)
{
    // Reset the random seed so results can be duplicated for the paper
    Rand.Restart(12347);

    var workerSelectionMethod = WorkerSelectionMethod.RandomWorker;

    // Path.Combine picks the platform's directory separator; a hard-coded backslash
    // only resolves on Windows.
    string dataPath = System.IO.Path.Combine("Data", dataSet + ".csv");
    var data = Datum.LoadData(dataPath);

    string modelName = GetModelName(dataSet, runType, taskSelectionMethod, workerSelectionMethod, communityCount);
    ActiveLearning.RunActiveLearning(data, modelName, runType, model, taskSelectionMethod, workerSelectionMethod, ResultsDir, communityCount);
}
/// <summary>
/// Loads one data set and runs the single-model active learning procedure on it
/// with random worker selection and a caller-supplied number of initial labels per task.
/// </summary>
/// <param name="dataSet">Name of the data set; the labels are read from "Data/&lt;dataSet&gt;.csv".</param>
/// <param name="runType">The model run type.</param>
/// <param name="taskSelectionMethod">The method for selecting tasks (Random / Entropy).</param>
/// <param name="InitialNumLabelsPerTask">Forwarded to <c>ActiveLearning.RunActiveLearning</c> as its initial number of labels per task.</param>
/// <param name="model">The model instance.</param>
/// <param name="communityCount">The number of communities (only for CBCC).</param>
public static void RunHCOMPActiveLearning(string dataSet, RunType runType, TaskSelectionMethod taskSelectionMethod, int InitialNumLabelsPerTask, BCC model, int communityCount = 4)
{
    WorkerSelectionMethod workerSelection = WorkerSelectionMethod.RandomWorker;

    string csvPath = @"Data/" + dataSet + ".csv";
    var labels = Datum.LoadData(csvPath);

    string experimentName = Program.GetModelName(dataSet, runType, taskSelectionMethod, workerSelection);

    ActiveLearning.RunActiveLearning(
        labels,
        experimentName,
        runType,
        model,
        taskSelectionMethod,
        workerSelection,
        ResultsDir,
        communityCount,
        InitialNumLabelsPerTask);
}
} //End AddModel

/// <summary>
/// Creates the <c>ExperimentModel</c> that corresponds to the given selections.
/// </summary>
/// <param name="currentRunType">The selected run type.</param>
/// <param name="currentTaskSelectionMethod">The selected task selection method.</param>
/// <param name="currentWorkerSelectionMethod">The selected worker selection method; replaced by RandomWorker for MajorityVote.</param>
/// <param name="labelStartingPoints">Label starting points, indexed by (labelling round - 1).</param>
/// <returns>The configured experiment model.</returns>
private ExperimentModel getExperimentItem(RunType currentRunType, TaskSelectionMethod currentTaskSelectionMethod, WorkerSelectionMethod currentWorkerSelectionMethod, int[] labelStartingPoints)
{
    // Defaults: a single labelling round with the caller's worker selection.
    WorkerSelectionMethod workerSelection = currentWorkerSelectionMethod;
    int labellingRound = 1;

    if (currentRunType == RunType.MajorityVote)
    {
        // Majority vote always uses random workers and a single round.
        workerSelection = WorkerSelectionMethod.RandomWorker;
    }
    else if (currentTaskSelectionMethod == TaskSelectionMethod.EntropyTask)
    {
        // Entropy tasks take the number of labelling rounds from the track bar.
        labellingRound = trackBarNumberOfLabellingRounds.Value;
    }

    return new ExperimentModel(
        currentTaskSelectionMethod,
        workerSelection,
        currentRunType,
        labellingRound,
        labelStartingPoints[labellingRound - 1]);
}
} //End AddModel

/// <summary>
/// Builds the <c>ExperimentModel</c> for the currently chosen run type and selection methods.
/// </summary>
/// <param name="currentRunType">The chosen run type.</param>
/// <param name="currentTaskSelectionMethod">The chosen task selection method.</param>
/// <param name="currentWorkerSelectionMethod">The chosen worker selection method (ignored for MajorityVote, which uses RandomWorker).</param>
/// <param name="labelStartingPoints">Label starting points; element (round - 1) is used.</param>
/// <returns>The experiment model describing this configuration.</returns>
private ExperimentModel getExperimentItem(RunType currentRunType, TaskSelectionMethod currentTaskSelectionMethod, WorkerSelectionMethod currentWorkerSelectionMethod, int[] labelStartingPoints)
{
    bool isMajorityVote = currentRunType == RunType.MajorityVote;
    bool usesLabellingRounds = !isMajorityVote && currentTaskSelectionMethod == TaskSelectionMethod.EntropyTask;

    // Only entropy-based task selection reads the number of rounds from the track bar;
    // every other configuration runs a single round.
    int round = usesLabellingRounds ? trackBarNumberOfLabellingRounds.Value : 1;

    // Majority vote has no worker selection of its own, so it is pinned to RandomWorker.
    WorkerSelectionMethod effectiveWorkerSelection = isMajorityVote
        ? WorkerSelectionMethod.RandomWorker
        : currentWorkerSelectionMethod;

    return new ExperimentModel(currentTaskSelectionMethod, effectiveWorkerSelection, currentRunType, round, labelStartingPoints[round - 1]);
}
/// <summary>
/// Runs the standard active learning procedure on an array of model instances and an input data set.
/// The models are processed one after another inside each iteration, and all of them draw new
/// labels through the same <c>currentCounts</c> / <c>remainingWorkersPerTask</c> dictionaries.
/// The method returns when the user requests a stop (<c>isExperimentCompleted</c> set externally)
/// or when no further labels can be selected, in which case it sets <c>isExperimentCompleted</c> itself.
/// </summary>
/// <param name="data">The data.</param>
/// <param name="modelName">The model names, one per model.</param>
/// <param name="runType">The model run types, one per model; its length defines the number of models.</param>
/// <param name="model">The model instances, one per model (used for BCC-style run types).</param>
/// <param name="taskSelectionMethod">The methods for selecting tasks, one per model.</param>
/// <param name="workerSelectionMethod">
/// The methods for selecting workers, one per model. An entry is overwritten with
/// <c>RandomWorker</c> when the corresponding results have no worker confusion matrix.
/// </param>
/// <param name="communityCount">The number of communities (only for CBCC).</param>
/// <param name="initialNumLabelsPerTask">The initial number of exploratory labels that are randomly selected for each task.</param>
/// <param name="numIncremData">The number of data points to add at each iteration.</param>
/// <exception cref="System.Exception">Thrown when the initial labels already cover the whole data set.</exception>
public static void RunParallelActiveLearning(IList<Datum> data, string[] modelName, RunType[] runType, BCC[] model, TaskSelectionMethod[] taskSelectionMethod, WorkerSelectionMethod[] workerSelectionMethod, int communityCount = -1, int initialNumLabelsPerTask = 1, int numIncremData = 1)
{
    int numModels = runType.Length;

    // Dictionary keyed by task Id, with randomly ordered labellings
    var groupedRandomisedData =
        data.GroupBy(d => d.TaskId).
        Select(g =>
        {
            var arr = g.ToArray();
            int cnt = arr.Length;
            var perm = Rand.Perm(cnt);
            return new
            {
                key = g.Key,
                arr = g.Select((t, i) => arr[perm[i]]).ToArray()
            };
        }).ToDictionary(a => a.key, a => a.arr);

    // Dictionaries keyed by task Id, with total and current label counts
    Dictionary<string, int> totalCounts = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => kvp.Value.Length);
    Dictionary<string, int> currentCounts = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => initialNumLabelsPerTask);

    // Keyed by task, value is a HashSet containing all the remaining workers with a label - workers are removed after adding a new datum
    Dictionary<string, HashSet<string>> remainingWorkersPerTask = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => new HashSet<string>(kvp.Value.Select(dat => dat.WorkerId)));
    int numTaskIds = totalCounts.Count;
    int totalInstances = data.Count - initialNumLabelsPerTask * numTaskIds;

    // Nothing would be left to select if the initial labels consume the whole data set
    if (totalInstances <= 0)
    {
        throw new System.Exception("The variable 'totalInstances' should be greater than zero");
    }

    // Only create the accuracy lists when they do not exist yet (for GUI use)
    if (accuracyArray == null)
    {
        accuracyArray = Util.ArrayInit<List<double>>(numModels, i => new List<double>());
    }

    List<double>[] avgRecallArray = Util.ArrayInit(numModels, i => new List<double>());
    taskValueListArray = Util.ArrayInit(numModels, i => new List<ActiveLearningResult>());
    int[] indexArray = new int[numModels];

    Debug.WriteLine("Parallel Active Learning");
    Debug.WriteLine("\tModel\tAcc\tAvgRec");

    // Get initial data; the results array is a class-level member so the GUI can read it
    results = Util.ArrayInit<Results>(numModels, i => new Results());
    List<Datum> subData = GetSubdata(groupedRandomisedData, currentCounts, remainingWorkersPerTask);
    List<Datum>[] subDataArray = Util.ArrayInit<List<Datum>>(numModels, i => new List<Datum>(subData));
    List<Datum>[] nextData = new List<Datum>[numModels];
    ActiveLearning[] activeLearning = new ActiveLearning[numModels];

    isExperimentCompleted = false;

    // Main loop
    for (int iter = 0; ; iter++)
    {
        bool calculateAccuracy = true;
        bool doSnapShot = iter % 100 == 0; // Frequency of snapshots

        // Stop active learning if the user requests to stop
        if (isExperimentCompleted)
        {
            return;
        }

        // Run all the models
        for (int indexModel = 0; indexModel < numModels; indexModel++)
        {
            if (subDataArray[indexModel] != null || nextData[indexModel] != null)
            {
                switch (runType[indexModel])
                {
                    case RunType.VoteDistribution:
                        results[indexModel].RunMajorityVote(subDataArray[indexModel], data, calculateAccuracy, true);
                        break;
                    case RunType.MajorityVote:
                        results[indexModel].RunMajorityVote(subDataArray[indexModel], data, calculateAccuracy, false);
                        break;
                    case RunType.DawidSkene:
                        results[indexModel].RunDawidSkene(subDataArray[indexModel], data, calculateAccuracy);
                        break;
                    default: // Run BCC models
                        results[indexModel].RunBCC(modelName[indexModel], subDataArray[indexModel], data, model[indexModel], RunMode.ClearResults, calculateAccuracy, communityCount, false);
                        break;
                }
            } // end of running the model on its current data

            if (activeLearning[indexModel] == null)
            {
                activeLearning[indexModel] = new ActiveLearning(data, model[indexModel], results[indexModel], communityCount);
            }
            else
            {
                activeLearning[indexModel].UpdateActiveLearningResults(results[indexModel]);
            }

            // Select next task
            Dictionary<string, ActiveLearningResult> TaskUtility = new Dictionary<string, ActiveLearningResult>();
            switch (taskSelectionMethod[indexModel])
            {
                case TaskSelectionMethod.EntropyTask:
                    TaskUtility = activeLearning[indexModel].EntropyTrueLabel();
                    break;

                case TaskSelectionMethod.RandomTask:
                    TaskUtility = data.GroupBy(d => d.TaskId).ToDictionary(a => a.Key, a => new ActiveLearningResult
                    {
                        TaskValue = Rand.Double()
                    });
                    break;

                case TaskSelectionMethod.UniformTask:
                    // Every task gets the same value; the entries are inserted in ascending order of current label count
                    TaskUtility = currentCounts.OrderBy(kvp => kvp.Value).ToDictionary(a => a.Key, a => new ActiveLearningResult
                    {
                        TaskValue = 1
                    });
                    break;

                default: // Entropy task selection
                    TaskUtility = activeLearning[indexModel].EntropyTrueLabel();
                    break;
            }

            // We create a list of worker utilities
            Dictionary<string, double> WorkerAccuracy = null;

            // Best worker selection is only allowed for methods that infer worker confusion matrices.
            if (results[indexModel].WorkerConfusionMatrix == null)
                workerSelectionMethod[indexModel] = WorkerSelectionMethod.RandomWorker;

            switch (workerSelectionMethod[indexModel])
            {
                case WorkerSelectionMethod.BestWorker:
                    // Assign worker accuracies to the maximum value on the diagonal of the confusion matrix (conservative approach).
                    // Alternative ways are also possible.
                    WorkerAccuracy = results[indexModel].WorkerConfusionMatrixMean.ToDictionary(
                        kvp => kvp.Key,
                        kvp => Results.GetConfusionMatrixDiagonal(kvp.Value).Max());
                    break;

                case WorkerSelectionMethod.RandomWorker:
                    // Assign worker accuracies to random values
                    WorkerAccuracy = results[indexModel].FullMapping.WorkerIdToIndex.ToDictionary(kvp => kvp.Key, kvp => Rand.Double());
                    break;

                default:
                    throw new ApplicationException("No worker selection method selected");
            }

            // Create a list of tuples (TaskId, WorkerId, ActiveLearningResult)
            List<Tuple<string, string, ActiveLearningResult>> LabelValue = new List<Tuple<string, string, ActiveLearningResult>>();
            foreach (var kvp in TaskUtility)
            {
                foreach (var workerId in remainingWorkersPerTask[kvp.Key])
                {
                    var labelValue = new ActiveLearningResult
                    {
                        WorkerId = workerId,
                        TaskId = kvp.Key,
                        TaskValue = kvp.Value.TaskValue,
                        WorkerValue = WorkerAccuracy[workerId]
                    };
                    LabelValue.Add(Tuple.Create(labelValue.TaskId, labelValue.WorkerId, labelValue));
                }
            }

            // Increment the active set with new data
            nextData[indexModel] = GetNextData(groupedRandomisedData, LabelValue, currentCounts, totalCounts, remainingWorkersPerTask, numIncremData);

            if (nextData[indexModel] == null || nextData[indexModel].Count == 0)
            {
                // No label could be selected. All models draw through the same count/worker
                // dictionaries, so nothing is left for the others either. A bare `break` here
                // would only leave the model loop and let the endless outer loop re-run
                // inference forever; mark the experiment as completed and stop instead,
                // as the single-model RunActiveLearning does when its data runs out.
                isExperimentCompleted = true;
                return;
            }

            indexArray[indexModel] += nextData[indexModel].Count;
            subDataArray[indexModel].AddRange(nextData[indexModel]);

            // Logs
            if (calculateAccuracy)
            {
                accuracyArray[indexModel].Add(results[indexModel].Accuracy);
                avgRecallArray[indexModel].Add(results[indexModel].AvgRecall);

                if (TaskUtility == null)
                {
                    var sortedLabelValue = LabelValue.OrderByDescending(kvp => kvp.Item3.TaskValue).ToArray();
                    taskValueListArray[indexModel].Add(sortedLabelValue.First().Item3);
                }
                else
                {
                    // Record which task/worker pair was actually added in this step
                    ActiveLearningResult nextTaskValueItem = TaskUtility[nextData[indexModel].First().TaskId];
                    nextTaskValueItem.WorkerId = nextData[indexModel].First().WorkerId;
                    nextTaskValueItem.TaskId = nextData[indexModel].First().TaskId;
                    taskValueListArray[indexModel].Add(nextTaskValueItem);
                }

                if (doSnapShot)
                {
                    Debug.WriteLine("{0} of {1}:\t{2}\t{3:0.000}\t{4:0.0000}", indexArray[indexModel], totalInstances, modelName[indexModel], accuracyArray[indexModel].Last(), avgRecallArray[indexModel].Last());
                }
            }
        } // end of models
    } // end for all data
}
/// <summary>
/// Runs the standard active learning procedure on a model instance and an input data set.
/// Each iteration re-runs the model on the labels selected so far, scores every remaining
/// (task, worker) pair, asks <c>GetNextData</c> for the next labels and logs accuracy and
/// average recall. The loop ends when <c>GetNextData</c> returns nothing; a final snapshot
/// is then written through <c>DoSnapshot</c>.
/// </summary>
/// <param name="data">The data.</param>
/// <param name="modelName">The model name.</param>
/// <param name="runType">The model run type.</param>
/// <param name="model">The model instance.</param>
/// <param name="taskSelectionMethod">The method for selecting tasks (Random / Entropy / Uniform; any other value falls back to Entropy).</param>
/// <param name="workerSelectionMethod">The method for selecting workers; forced to RandomWorker when the results have no worker confusion matrix.</param>
/// <param name="resultsDir">The directory to save the log files.</param>
/// <param name="communityCount">The number of communities (only for CBCC).</param>
/// <param name="initialNumLabelsPerTask">The initial number of exploratory labels that are randomly selected for each task.</param>
/// <param name="numIncremData">The number of data points to add at each round.</param>
/// <exception cref="System.Exception">Thrown when the computed number of selectable labels is not positive.</exception>
public static void RunActiveLearning(
    IList<Datum> data,
    string modelName,
    RunType runType,
    BCC model,
    TaskSelectionMethod taskSelectionMethod,
    WorkerSelectionMethod workerSelectionMethod,
    string resultsDir,
    int communityCount = -1,
    int initialNumLabelsPerTask = 1,
    int numIncremData = 1)
{
    // Measure the elapsed time of the whole run
    Stopwatch stopWatchTotal = new Stopwatch();
    stopWatchTotal.Start();
    int totalLabels = data.Count(); // NOTE(review): not read anywhere below

    // Dictionary keyed by task Id, with randomly ordered labellings
    // (the order comes from Rand.Perm, so it depends on the current random seed)
    var groupedRandomisedData =
        data.GroupBy(d => d.TaskId).
        Select(g =>
        {
            var arr = g.ToArray();
            int cnt = arr.Length;
            var perm = Rand.Perm(cnt);
            return new
            {
                key = g.Key,
                arr = g.Select((t, i) => arr[perm[i]]).ToArray()
            };
        }).ToDictionary(a => a.key, a => a.arr);

    // Dictionary keyed by task Id, with label counts
    Dictionary<string, int> totalCounts = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => kvp.Value.Length);

    // Keyed by task, value is a HashSet containing all the remaining workers with a label - workers are removed after adding a new datum
    Dictionary<string, HashSet<string>> remainingWorkersPerTask = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => new HashSet<string>(kvp.Value.Select(dat => dat.WorkerId)));
    int numTaskIds = totalCounts.Count();

    // Number of labels left after the initial ones; used for the guard below and the progress log
    int totalInstances = initialNumLabelsPerTask > 0 ? data.Count - initialNumLabelsPerTask * numTaskIds : data.Count - numIncremData;

    // Throw an exception if totalInstances is less than or equal to zero
    if (totalInstances <= 0)
    {
        throw new System.Exception("The variable 'totalInstances' should be greater than zero");
    }

    string[] WorkerIds = data.Select(d => d.WorkerId).Distinct().ToArray(); // NOTE(review): not read anywhere below

    // Only create the accuracy list when it is null (for GUI use);
    // accuracy and taskValueList are members declared outside this method
    if (accuracy == null)
    {
        accuracy = new List<double>();
    }

    List<double> avgRecall = new List<double>();
    taskValueList = new List<ActiveLearningResult>();
    int index = 0; // running count of labels added after the initial set

    Console.WriteLine("Active Learning: {0}", modelName);
    Console.WriteLine("\t\t\t\t\t\tAcc\tAvgRec");

    // Get initial data
    Results results = new Results();
    Dictionary<string, int> currentCounts = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => initialNumLabelsPerTask);
    List<Datum> subData = GetSubdata(groupedRandomisedData, currentCounts, remainingWorkersPerTask);
    var s = remainingWorkersPerTask.Select(w => w.Value.Count).Sum(); // NOTE(review): not read anywhere below

    List<Datum> nextData = null;
    ActiveLearning activeLearning = null;

    isExperimentCompleted = false;

    for (int iter = 0; ; iter++) // run until the data runs out (see the break below)
    {
        bool calculateAccuracy = true;
        bool doSnapShot = iter % 1 == 0; // always true: a progress line is written every iteration

        // Run the model selected by runType on the labels gathered so far
        if (subData != null || nextData != null)
        {
            switch (runType)
            {
                case RunType.VoteDistribution:
                    results.RunMajorityVote(subData, data, calculateAccuracy, true);
                    break;
                case RunType.MajorityVote:
                    results.RunMajorityVote(subData, data, calculateAccuracy, false);
                    break;
                case RunType.DawidSkene:
                    results.RunDawidSkene(subData, data, calculateAccuracy);
                    break;
                default: // Run BCC models
                    results.RunBCC(modelName, subData, data, model, RunMode.ClearResults, calculateAccuracy, communityCount, false);
                    break;
            }
        }

        // Create the active learning helper on the first pass, refresh it with the new results afterwards
        if (activeLearning == null)
        {
            activeLearning = new ActiveLearning(data, model, results, communityCount);
        }
        else
        {
            activeLearning.UpdateActiveLearningResults(results);
        }

        // We create a list of task utilities
        // TaskUtility: Dictionary keyed by task, the value is an active learning result.
        Dictionary<string, ActiveLearningResult> TaskUtility = null;
        switch (taskSelectionMethod)
        {
            case TaskSelectionMethod.EntropyTask:
                TaskUtility = activeLearning.EntropyTrueLabel();
                break;

            case TaskSelectionMethod.RandomTask:
                TaskUtility = data.GroupBy(d => d.TaskId).ToDictionary(a => a.Key, a => new ActiveLearningResult
                {
                    TaskValue = Rand.Double()
                });
                break;

            case TaskSelectionMethod.UniformTask:
                // Reproduce uniform task selection by picking the task with the lowest number of current labels.
                // That is, the task value is minus the current count.
                TaskUtility = currentCounts.OrderBy(kvp => kvp.Value).ToDictionary(a => a.Key, a => new ActiveLearningResult
                {
                    TaskId = a.Key,
                    TaskValue = -a.Value
                });
                break;

            default: // Any other value falls back to entropy task selection
                TaskUtility = activeLearning.EntropyTrueLabel();
                break;
        }

        // We create a list of worker utilities.
        Dictionary<string, double> WorkerAccuracy = null;

        // Best worker selection is only allowed for methods that infer worker confusion matrices.
        if (results.WorkerConfusionMatrix == null)
            workerSelectionMethod = WorkerSelectionMethod.RandomWorker;

        switch (workerSelectionMethod)
        {
            case WorkerSelectionMethod.BestWorker:
                // Assign worker accuracies to the maximum value on the diagonal of the confusion matrix (conservative approach).
                // Alternative ways are also possible.
                WorkerAccuracy = results.WorkerConfusionMatrixMean.ToDictionary(
                    kvp => kvp.Key,
                    kvp => Results.GetConfusionMatrixDiagonal(kvp.Value).Max());
                break;

            case WorkerSelectionMethod.RandomWorker:
                // Assign worker accuracies to random values
                WorkerAccuracy = results.FullMapping.WorkerIdToIndex.ToDictionary(kvp => kvp.Key, kvp => Rand.Double());
                break;

            default:
                throw new ApplicationException("No worker selection method selected");
        }

        // Create a list of tuples (TaskId, WorkerId, ActiveLearningResult),
        // one for every worker still available on every task in TaskUtility.
        List<Tuple<string, string, ActiveLearningResult>> LabelValue = new List<Tuple<string,string,ActiveLearningResult>>();
        foreach (var kvp in TaskUtility)
        {
            foreach (var workerId in remainingWorkersPerTask[kvp.Key])
            {
                var labelValue = new ActiveLearningResult
                {
                    WorkerId = workerId,
                    TaskId = kvp.Key,
                    TaskValue = kvp.Value.TaskValue,
                    WorkerValue = WorkerAccuracy[workerId]
                };
                LabelValue.Add(Tuple.Create(labelValue.TaskId, labelValue.WorkerId, labelValue));
            }
        }

        // Increment the active set with new data
        nextData = GetNextData(groupedRandomisedData, LabelValue, currentCounts, totalCounts, remainingWorkersPerTask, numIncremData);

        // No further labels could be selected: leave the loop
        if (nextData == null || nextData.Count == 0)
            break;

        index += nextData.Count;
        subData.AddRange(nextData);

        // Logs
        if (calculateAccuracy)
        {
            accuracy.Add(results.Accuracy);
            avgRecall.Add(results.AvgRecall);

            // NOTE(review): TaskUtility was already enumerated above, so it looks like it
            // cannot be null when this point is reached - confirm before relying on this branch.
            if (TaskUtility == null)
            {
                var sortedLabelValue = LabelValue.OrderByDescending(kvp => kvp.Item3.TaskValue).ToArray();
                taskValueList.Add(sortedLabelValue.First().Item3);
            }
            else
            {
                // Record the task/worker pair of the first datum that was just added
                ActiveLearningResult nextTaskValueItem = TaskUtility[nextData.First().TaskId];
                nextTaskValueItem.WorkerId = nextData.First().WorkerId;
                nextTaskValueItem.TaskId = nextData.First().TaskId;
                taskValueList.Add(nextTaskValueItem);
            }

            if (doSnapShot)
            {
                Console.WriteLine("{0} (label {1} of {2}):\t{3:0.000}\t{4:0.0000}", modelName, index, totalInstances, accuracy.Last(), avgRecall.Last());
            }
        } // end if logs
    } // end for all data

    // Data exhausted: flag completion, write the final snapshot and reset the shared accuracy list
    isExperimentCompleted = true;
    stopWatchTotal.Stop();
    DoSnapshot(accuracy, avgRecall, taskValueList, results, modelName, "final", resultsDir, initialNumLabelsPerTask);
    ResetAccuracyList();
    Console.WriteLine("Elapsed time: {0}\n", stopWatchTotal.Elapsed);
}
/// <summary>
/// Returns the model name in the form "dataset_runType_taskSelectionMethod".
/// </summary>
/// <param name="dataset">The name of the data set.</param>
/// <param name="runType">The model run type.</param>
/// <param name="taskSelectionMethod">The method for selecting tasks (Random / Entropy).</param>
/// <param name="workerSelectionMethod">The method for selecting workers; accepted but not part of the returned name.</param>
/// <param name="numCommunities">The number of communities; accepted but not part of the returned name.</param>
/// <returns>The model name</returns>
public static string GetModelName(string dataset, RunType runType, TaskSelectionMethod taskSelectionMethod, WorkerSelectionMethod workerSelectionMethod, int numCommunities = -1)
{
    string runTypeName = Enum.GetName(typeof(RunType), runType);
    string taskMethodName = Enum.GetName(typeof(TaskSelectionMethod), taskSelectionMethod);

    return string.Concat(dataset, "_", runTypeName, "_", taskMethodName);
}
/// <summary>
/// Runs the active learning experiment for the selected model(s) on a range of the
/// data sets listed in <c>Program.Datasets</c>.
/// </summary>
/// <param name="startIndex">Index of the first data set in <c>Program.Datasets</c> to run (inclusive).</param>
/// <param name="endIndex">Index of the last data set in <c>Program.Datasets</c> to run (inclusive).</param>
/// <param name="whichModel">
/// Model to run: 1 = MajorityVote, 2 = DawidSkene, 3 = BCC, 4 = CBCC;
/// any other value runs all four in that order.
/// </param>
/// <param name="currentTaskSelectionMethod">The task selection method applied to every run.</param>
/// <param name="InitialNumLabelsPerTask">The initial number of labels per task, forwarded to every run.</param>
public static void RunHCOMPExperiments(int startIndex, int endIndex, int whichModel, TaskSelectionMethod currentTaskSelectionMethod, int InitialNumLabelsPerTask)
{
    for (int ds = startIndex; ds <= endIndex; ds++)
    {
        switch (whichModel)
        {
            case 1:
                RunHCOMPActiveLearning(Program.Datasets[ds], RunType.MajorityVote, currentTaskSelectionMethod, InitialNumLabelsPerTask, null);
                break;
            case 2:
                RunHCOMPActiveLearning(Program.Datasets[ds], RunType.DawidSkene, currentTaskSelectionMethod, InitialNumLabelsPerTask, null);
                break;
            case 3:
                RunHCOMPActiveLearning(Program.Datasets[ds], RunType.BCC, currentTaskSelectionMethod, InitialNumLabelsPerTask, new BCC());
                break;
            case 4:
                RunHCOMPActiveLearning(Program.Datasets[ds], RunType.CBCC, currentTaskSelectionMethod, InitialNumLabelsPerTask, new CBCC(), Program.NumCommunities[ds]);
                break;
            default: // Run all
                RunHCOMPActiveLearning(Program.Datasets[ds], RunType.MajorityVote, currentTaskSelectionMethod, InitialNumLabelsPerTask, null);
                RunHCOMPActiveLearning(Program.Datasets[ds], RunType.DawidSkene, currentTaskSelectionMethod, InitialNumLabelsPerTask, null);
                RunHCOMPActiveLearning(Program.Datasets[ds], RunType.BCC, currentTaskSelectionMethod, InitialNumLabelsPerTask, new BCC());
                RunHCOMPActiveLearning(Program.Datasets[ds], RunType.CBCC, currentTaskSelectionMethod, InitialNumLabelsPerTask, new CBCC(), Program.NumCommunities[ds]);
                // A fifth call used to follow here that paired RunType.BCC with a CBCC instance.
                // It produced the same model name as the BCC run above (the name is derived from
                // data set, run type and selection methods only), so it presumably replaced the
                // genuine BCC output with results from a mismatched model. It has been removed.
                break;
        }
    }
}
/// <summary>
/// Creates an experiment model with explicit task and worker selection methods.
/// The run type, number of labelling rounds and label starting point are handled by the
/// three-argument constructor this one chains to.
/// </summary>
/// <param name="taskSelectionMethod">The task selection method to store.</param>
/// <param name="workerSelectionMethod">The worker selection method to store.</param>
/// <param name="runType">The run type, passed on to the chained constructor.</param>
/// <param name="numberOfLabellingRound">The number of labelling rounds, passed on to the chained constructor.</param>
/// <param name="labelStartingPoint">The label starting point, passed on to the chained constructor.</param>
public ExperimentModel(
    TaskSelectionMethod taskSelectionMethod,
    WorkerSelectionMethod workerSelectionMethod,
    RunType runType,
    int numberOfLabellingRound,
    int labelStartingPoint)
    : this(runType, numberOfLabellingRound, labelStartingPoint)
{
    // Selection methods are assigned after the chained constructor has run.
    this.taskSelectionMethod = taskSelectionMethod;
    WorkerSelectionMethod = workerSelectionMethod;
}
/// <summary>
/// Assembles the name of an experiment run from its settings.
/// </summary>
/// <param name="dataset">Name of the data set; first component of the name.</param>
/// <param name="runType">Model run type; contributes its enum member name.</param>
/// <param name="taskSelectionMetric">Task selection method; contributes its enum member name.</param>
/// <param name="workerSelectionMetric">Worker selection method; contributes its enum member name.</param>
/// <param name="online">When true, "Online" is appended right after the worker selection name.</param>
/// <param name="taskSamples">Adds "_T{n}" when greater than zero.</param>
/// <param name="workerSamples">Adds "_W{n}" when greater than zero.</param>
/// <param name="numCommunities">Adds "_Comm{n}" when greater than zero.</param>
/// <returns>The assembled model name.</returns>
private static string GetModelName(string dataset, RunType runType, TaskSelectionMethod taskSelectionMetric, WorkerSelectionMethod workerSelectionMetric, bool online, int taskSamples = -1, int workerSamples = -1, int numCommunities = -1)
{
    var builder = new System.Text.StringBuilder();

    // Fixed prefix: dataset_runType_taskMethod_workerMethod
    builder.Append(dataset);
    builder.Append("_").Append(Enum.GetName(typeof(RunType), runType));
    builder.Append("_").Append(Enum.GetName(typeof(TaskSelectionMethod), taskSelectionMetric));
    builder.Append("_").Append(Enum.GetName(typeof(WorkerSelectionMethod), workerSelectionMetric));

    // Optional suffixes, in a fixed order
    if (online)
    {
        builder.Append("Online");
    }

    if (taskSamples > 0)
    {
        builder.Append("_T").Append(taskSamples.ToString());
    }

    if (workerSamples > 0)
    {
        builder.Append("_W").Append(workerSamples.ToString());
    }

    if (numCommunities > 0)
    {
        builder.Append("_Comm").Append(numCommunities.ToString());
    }

    return builder.ToString();
}
/// <summary>
/// Background-thread entry point that collects the settings of every configured experiment
/// model and hands them to <c>ActiveLearning.RunParallelActiveLearning</c>.
/// </summary>
/// <param name="worker">The background worker running this method (not read by the method body).</param>
/// <param name="e">The DoWork event arguments (not read by the method body).</param>
public void RunParallelActiveLearning(
    System.ComponentModel.BackgroundWorker worker,
    System.ComponentModel.DoWorkEventArgs e)
{
    // State object of this run
    CurrentParallelState currentState = new CurrentParallelState();

    int modelCount = GetNumberOfExperiemntModels();

    // Clear previous results and keep a reference to the fresh accuracy lists
    ActiveLearning.ResetParallelAccuracyList(modelCount);
    accuracyArrayOfAllExperimentModels = ActiveLearning.accuracyArray;

    // One slot per experiment model
    var runTypes = new RunType[modelCount];
    var taskSelections = new TaskSelectionMethod[modelCount];
    var workerSelections = new WorkerSelectionMethod[modelCount];
    var bccModels = new BCC[modelCount];

    for (int i = 0; i < modelCount; i++)
    {
        ExperimentModel experiment = GetExperimentModel(i);
        RunType kind = experiment.runType;

        runTypes[i] = kind;
        taskSelections[i] = experiment.taskSelectionMethod;

        // Only run types with worker confusion matrices (DawidSkene, BCC, CBCC) take a
        // worker selection method; the BCC-style ones also need a model instance.
        switch (kind)
        {
            case RunType.DawidSkene:
                workerSelections[i] = experiment.WorkerSelectionMethod;
                break;

            case RunType.BCC:
                workerSelections[i] = experiment.WorkerSelectionMethod;
                bccModels[i] = new BCC();
                break;

            case RunType.CBCC:
                workerSelections[i] = experiment.WorkerSelectionMethod;
                bccModels[i] = new CBCC();
                break;
        }
    }

    // Model names are derived from the data set name and the run type
    var modelNames = new string[modelCount];
    for (int i = 0; i < modelCount; i++)
    {
        modelNames[i] = CrowdsourcingModels.Program.GetModelName(currentDataset.GetDataSetNameWithoutExtension(), runTypes[i]);
    }

    // Hand over to the static procedure in ActiveLearning
    ActiveLearning.RunParallelActiveLearning(
        currentDataset.LoadData(),
        modelNames,
        runTypes,
        bccModels,
        taskSelections,
        workerSelections,
        communityCount,
        numberOfLabellingRound);

    currentState.isRunningComplete = true;
    Debug.WriteLine("RunParallelActiveLearning Complete");
}//end function RunParallelActiveLearning
/// <summary>
/// Returns the model name in the form "dataset_runType_taskSelectionMethod_workerSelectionMethod".
/// </summary>
/// <param name="dataset">The name of the data set.</param>
/// <param name="runType">The model run type.</param>
/// <param name="taskSelectionMethod">The method for selecting tasks (Random / Entropy).</param>
/// <param name="workerSelectionMethod">The method for selecting workers.</param>
/// <returns>The model name</returns>
public static string GetModelName(string dataset, RunType runType, TaskSelectionMethod taskSelectionMethod, WorkerSelectionMethod workerSelectionMethod)
{
    // The enum names are appended unconditionally. The earlier `!x.Equals("")` guards compared
    // an enum value with a string, which is never equal, so they were always true and only
    // boxed the enum on every call.
    return dataset
        + "_" + Enum.GetName(typeof(RunType), runType)
        + "_" + Enum.GetName(typeof(TaskSelectionMethod), taskSelectionMethod)
        + "_" + Enum.GetName(typeof(WorkerSelectionMethod), workerSelectionMethod);
}
/// <summary>
/// Runs the parallel active learning experiment from a background thread: gathers run type,
/// selection methods and model instance of each configured experiment model, derives the model
/// names and calls <c>ActiveLearning.RunParallelActiveLearning</c>.
/// </summary>
/// <param name="worker">The background worker running this method (not read by the method body).</param>
/// <param name="e">The DoWork event arguments (not read by the method body).</param>
public void RunParallelActiveLearning(
    System.ComponentModel.BackgroundWorker worker,
    System.ComponentModel.DoWorkEventArgs e)
{
    // State object of this run
    CurrentParallelState runState = new CurrentParallelState();

    int experimentCount = GetNumberOfExperiemntModels();

    // Reset the shared accuracy lists, then keep a reference to them
    ActiveLearning.ResetParallelAccuracyList(experimentCount);
    accuracyArrayOfAllExperimentModels = ActiveLearning.accuracyArray;

    RunType[] selectedRunTypes = new RunType[experimentCount];
    TaskSelectionMethod[] selectedTaskMethods = new TaskSelectionMethod[experimentCount];
    WorkerSelectionMethod[] selectedWorkerMethods = new WorkerSelectionMethod[experimentCount];
    BCC[] modelInstances = new BCC[experimentCount];

    for (int slot = 0; slot < experimentCount; slot++)
    {
        ExperimentModel item = GetExperimentModel(slot);
        RunType itemRunType = item.runType;

        selectedRunTypes[slot] = itemRunType;
        selectedTaskMethods[slot] = item.taskSelectionMethod;

        // Run types without worker confusion matrices keep the default worker selection
        // and get no model instance.
        bool hasWorkerMatrices =
            itemRunType == RunType.DawidSkene ||
            itemRunType == RunType.BCC ||
            itemRunType == RunType.CBCC;
        if (!hasWorkerMatrices)
        {
            continue;
        }

        selectedWorkerMethods[slot] = item.WorkerSelectionMethod;

        if (itemRunType == RunType.BCC)
        {
            modelInstances[slot] = new BCC();
        }
        else if (itemRunType == RunType.CBCC)
        {
            modelInstances[slot] = new CBCC();
        }
    }

    // One name per model, built from the data set name and the run type
    string[] names = new string[experimentCount];
    for (int slot = 0; slot < experimentCount; slot++)
    {
        names[slot] = CrowdsourcingModels.Program.GetModelName(currentDataset.GetDataSetNameWithoutExtension(), selectedRunTypes[slot]);
    }

    ActiveLearning.RunParallelActiveLearning(currentDataset.LoadData(), names, selectedRunTypes, modelInstances, selectedTaskMethods, selectedWorkerMethods, communityCount, numberOfLabellingRound);

    runState.isRunningComplete = true;
    Debug.WriteLine("RunParallelActiveLearning Complete");
}//end function RunParallelActiveLearning
/// <summary>
/// Runs the standard active learning procedure on a model instance and an input data set.
/// </summary>
/// <remarks>
/// The labels of every task are shuffled once and an initial subset is drawn through <c>GetSubdata</c>.
/// Each round then (1) re-runs the model selected by <paramref name="runType"/> on the labels gathered
/// so far, (2) scores the tasks according to <paramref name="taskSelectionMethod"/> and (3) asks
/// <c>GetNextData</c> for further labels. The loop ends after 500 rounds or as soon as <c>GetNextData</c>
/// returns no labels. Accuracy, negative log probability and average recall are recorded every round and
/// written through <c>DoSnapshot</c> ("interim" during the loop, "final" once at the end).
/// </remarks>
/// <param name="data">The data.</param>
/// <param name="modelName">The model name.</param>
/// <param name="runType">The model run type.</param>
/// <param name="model">The model instance.</param>
/// <param name="taskSelectionMethod">The method for selecting tasks (Random / Entropy).</param>
/// <param name="workerSelectionMethod">The method for selecting workers (only Random is implemented). Not read by this method.</param>
/// <param name="resultsDir">The directory to save the log files.</param>
/// <param name="communityCount">The number of communities (only for CBCC).</param>
/// <param name="initialNumLabelsPerTask">The initial number of exploratory labels that are randomly selected for each task.</param>
/// <exception cref="InvalidOperationException">The task selection step produced no task values although new labels were selected.</exception>
public static void RunActiveLearning(IList<Datum> data, string modelName, RunType runType, BCC model, TaskSelectionMethod taskSelectionMethod, WorkerSelectionMethod workerSelectionMethod, string resultsDir, int communityCount = -1, int initialNumLabelsPerTask = 1)
{
    // Upper bound on the number of select-and-relabel rounds.
    const int maxIterations = 500;

    // Forwarded to GetNextData on every round.
    // NOTE(review): presumably the number of labels requested per round - confirm against GetNextData.
    const int numIncremData = 3;

    // Count elapsed time
    Stopwatch stopWatchTotal = new Stopwatch();
    stopWatchTotal.Start();

    // Dictionary keyed by task id, with the labels of each task in a random order
    var groupedRandomisedData = data
        .GroupBy(d => d.TaskId)
        .Select(g =>
        {
            var arr = g.ToArray();
            var perm = Rand.Perm(arr.Length);
            return new { key = g.Key, arr = arr.Select((t, i) => arr[perm[i]]).ToArray() };
        })
        .ToDictionary(a => a.key, a => a.arr);

    // Dictionaries keyed by task id: labels available in total / labels counted as used so far
    Dictionary<string, int> totalCounts = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => kvp.Value.Length);
    Dictionary<string, int> currentCounts = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => initialNumLabelsPerTask);

    // Keyed by task id, value is the set of workers that have a label for that task.
    // NOTE(review): the set is handed to GetSubdata, which presumably removes workers as their labels are consumed - confirm.
    Dictionary<string, HashSet<string>> remainingWorkersPerTask = groupedRandomisedData.ToDictionary(
        kvp => kvp.Key,
        kvp => new HashSet<string>(kvp.Value.Select(dat => dat.WorkerId)));

    // Number of labels left once every task has received its initial labels (only used for the console log).
    int totalInstances = data.Count - initialNumLabelsPerTask * totalCounts.Count;

    // Log structures
    List<double> accuracy = new List<double>();
    List<double> nlpd = new List<double>();
    List<double> avgRecall = new List<double>();
    List<ActiveLearningResult> taskValueList = new List<ActiveLearningResult>();
    int index = 0;

    Console.WriteLine("Active Learning: {0}", modelName);
    Console.WriteLine("\t\tAcc\tAvgRec");

    // Get initial data
    Results results = new Results();
    List<Datum> subData = GetSubdata(groupedRandomisedData, currentCounts, remainingWorkersPerTask);
    List<Datum> nextData = null;
    ActiveLearning activeLearning = null;

    for (int iter = 0; iter < maxIterations; iter++)
    {
        bool calculateAccuracy = true;

        // Frequency of snapshots: every round (use e.g. iter % 100 == 0 to thin them out).
        bool doSnapShot = true;

        // Re-run the selected model on everything gathered so far.
        if (subData != null || nextData != null)
        {
            switch (runType)
            {
                case RunType.VoteDistribution:
                    results.RunMajorityVote(subData, calculateAccuracy, true);
                    break;

                case RunType.MajorityVote:
                    results.RunMajorityVote(subData, calculateAccuracy, false);
                    break;

                case RunType.DawidSkene:
                    results.RunDawidSkene(subData, calculateAccuracy);
                    break;

                default:
                    // Run BCC models
                    results.RunBCC(modelName, subData, data, model, Results.RunMode.ClearResults, calculateAccuracy, communityCount, false);
                    break;
            }
        }

        // The helper is created on the first round and refreshed with the new results afterwards.
        if (activeLearning == null)
        {
            activeLearning = new ActiveLearning(data, model, results, communityCount);
        }
        else
        {
            activeLearning.UpdateActiveLearningResults(results);
        }

        // Select next task: random scores for RandomTask, entropy of the true-label posterior otherwise.
        Dictionary<string, ActiveLearningResult> taskValues;
        if (taskSelectionMethod == TaskSelectionMethod.RandomTask)
        {
            taskValues = data
                .GroupBy(d => d.TaskId)
                .ToDictionary(a => a.Key, a => new ActiveLearningResult { TaskValue = Rand.Double() });
        }
        else
        {
            taskValues = activeLearning.EntropyTrueLabelPosterior();
        }

        nextData = GetNextData(groupedRandomisedData, taskValues, currentCounts, totalCounts, numIncremData);
        if (nextData == null || nextData.Count == 0)
        {
            // Nothing left to select.
            break;
        }

        index += nextData.Count;

        // The guard before the model run allows for a null initial subset, so create the list on demand.
        if (subData == null)
        {
            subData = new List<Datum>();
        }
        subData.AddRange(nextData);

        // Logs
        if (calculateAccuracy)
        {
            accuracy.Add(results.Accuracy);
            nlpd.Add(results.NegativeLogProb);
            avgRecall.Add(results.AvgRecall);

            if (taskValues == null)
            {
                // There is no label-level fallback to log from; fail with an explicit message.
                throw new InvalidOperationException("Task selection returned no task values, so the selected task cannot be logged.");
            }

            taskValueList.Add(taskValues[nextData.First().TaskId]);

            if (doSnapShot)
            {
                Console.WriteLine("{0} of {1}:\t{2:0.000}\t{3:0.0000}", index, totalInstances, accuracy.Last(), avgRecall.Last());
                DoSnapshot(accuracy, nlpd, avgRecall, taskValueList, results, modelName, "interim", resultsDir);
            }
        }
    }

    stopWatchTotal.Stop();
    DoSnapshot(accuracy, nlpd, avgRecall, taskValueList, results, modelName, "final", resultsDir);
    Console.WriteLine("Elapsed time: {0}\n", stopWatchTotal.Elapsed);
}
/// <summary>
/// Returns the model name as a string.
/// </summary>
/// <param name="dataset">The name of the data set.</param>
/// <param name="runType">The model run type.</param>
/// <param name="taskSelectionMethod">The method for selecting tasks (Random / Entropy).</param>
/// <param name="workerSelectionMethod">The method for selecting workers (only Random is implemented).</param>
/// <returns>
/// The model name in the form <c>dataset_RunType_TaskSelectionMethod_WorkerSelectionMethod</c>.
/// A component is empty when its value is not a named constant of the enum
/// (or when <paramref name="dataset"/> is null).
/// </returns>
public static string GetModelName(string dataset, RunType runType, TaskSelectionMethod taskSelectionMethod, WorkerSelectionMethod workerSelectionMethod)
{
    // Every component is always appended: an enum value can never equal a string,
    // so there is no "empty" selection method to special-case here.
    return dataset
        + "_" + Enum.GetName(typeof(RunType), runType)
        + "_" + Enum.GetName(typeof(TaskSelectionMethod), taskSelectionMethod)
        + "_" + Enum.GetName(typeof(WorkerSelectionMethod), workerSelectionMethod);
}