Ejemplo n.º 1
0
 /// <summary>
 /// Composes the name of a model run from the data set and the run configuration.
 /// </summary>
 /// <remarks>
 /// The name has the form <c>dataset_RunType_TaskSelection_WorkerSelection</c>, immediately followed by
 /// <c>Online</c> when <paramref name="online"/> is set, and then by the optional suffixes
 /// <c>_T{taskSamples}</c>, <c>_W{workerSamples}</c> and <c>_Comm{numCommunities}</c>. Each optional
 /// suffix is appended only when its value is greater than zero.
 /// </remarks>
 /// <param name="dataset">The data set name; used as the first part of the result.</param>
 /// <param name="runType">The run type; its enum member name is the second part.</param>
 /// <param name="taskSelectionMetric">The task selection method; its enum member name is the third part.</param>
 /// <param name="workerSelectionMetric">The worker selection method; its enum member name is the fourth part.</param>
 /// <param name="online">When true, the text "Online" is appended (without a separating underscore).</param>
 /// <param name="taskSamples">Appended as "_T" plus the value when greater than zero.</param>
 /// <param name="workerSamples">Appended as "_W" plus the value when greater than zero.</param>
 /// <param name="numCommunities">Appended as "_Comm" plus the value when greater than zero.</param>
 /// <returns>The assembled model name.</returns>
 private static string GetModelName(string dataset, RunType runType, TaskSelectionMethod taskSelectionMetric, WorkerSelectionMethod workerSelectionMetric, bool online, int taskSamples = -1, int workerSamples = -1, int numCommunities = -1)
 {
     string runTypeLabel = Enum.GetName(typeof(RunType), runType);
     string taskLabel = Enum.GetName(typeof(TaskSelectionMethod), taskSelectionMetric);
     string workerLabel = Enum.GetName(typeof(WorkerSelectionMethod), workerSelectionMetric);

     // Mandatory part: the four underscore-separated components.
     string name = dataset + "_" + runTypeLabel + "_" + taskLabel + "_" + workerLabel;

     // Optional parts, in the fixed order Online, _T, _W, _Comm.
     if (online)
     {
         name += "Online";
     }

     if (taskSamples > 0)
     {
         name += "_T" + taskSamples.ToString();
     }

     if (workerSamples > 0)
     {
         name += "_W" + workerSamples.ToString();
     }

     if (numCommunities > 0)
     {
         name += "_Comm" + numCommunities.ToString();
     }

     return name;
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Runs the active learning experiment presented in Venanzi et al. (WWW14) on a single data set,
        /// always using random worker selection.
        /// </summary>
        /// <remarks>
        /// The random generator is restarted with a fixed seed before anything else happens, the labels are
        /// read from <c>Data\{dataSet}.csv</c>, and the run is delegated to
        /// <c>ActiveLearning.RunActiveLearning</c> with <c>ResultsDir</c> as the output directory.
        /// </remarks>
        /// <param name="dataSet">The data set name; it selects the CSV file and is part of the model name.</param>
        /// <param name="runType">The model run type.</param>
        /// <param name="taskSelectionMethod">The method for selecting tasks (Random / Entropy).</param>
        /// <param name="model">The model instance.</param>
        /// <param name="communityCount">The number of communities (only for CBCC).</param>
        static void RunWWWActiveLearning(string dataSet, RunType runType, TaskSelectionMethod taskSelectionMethod, BCC model, int communityCount = 4)
        {
            // Fixed seed so that the results can be duplicated for the paper.
            Rand.Restart(12347);

            WorkerSelectionMethod randomWorker = WorkerSelectionMethod.RandomWorker;

            string csvPath = string.Concat(@"Data\", dataSet, ".csv");
            var labels = Datum.LoadData(csvPath);

            string runName = GetModelName(dataSet, runType, taskSelectionMethod, randomWorker, communityCount);

            ActiveLearning.RunActiveLearning(
                labels,
                runName,
                runType,
                model,
                taskSelectionMethod,
                randomWorker,
                ResultsDir,
                communityCount);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Runs an active learning experiment on a single data set with random worker selection and a
        /// caller-supplied number of initial labels per task.
        /// </summary>
        /// <remarks>
        /// The labels are read from <c>Data/{dataSet}.csv</c> and the run is delegated to
        /// <c>ActiveLearning.RunActiveLearning</c> with <c>ResultsDir</c> as the output directory.
        /// </remarks>
        /// <param name="dataSet">The data set name; it selects the CSV file and is part of the model name.</param>
        /// <param name="runType">The model run type.</param>
        /// <param name="taskSelectionMethod">The method for selecting tasks (Random / Entropy).</param>
        /// <param name="InitialNumLabelsPerTask">Passed through unchanged as the initial number of labels per task.</param>
        /// <param name="model">The model instance.</param>
        /// <param name="communityCount">The number of communities (only for CBCC).</param>
        public static void RunHCOMPActiveLearning(string dataSet, RunType runType, TaskSelectionMethod taskSelectionMethod, int InitialNumLabelsPerTask, BCC model, int communityCount = 4)
        {
            WorkerSelectionMethod randomWorker = WorkerSelectionMethod.RandomWorker;

            string csvPath = string.Concat(@"Data/", dataSet, ".csv");
            var labels = Datum.LoadData(csvPath);

            string runName = Program.GetModelName(dataSet, runType, taskSelectionMethod, randomWorker);

            ActiveLearning.RunActiveLearning(
                labels,
                runName,
                runType,
                model,
                taskSelectionMethod,
                randomWorker,
                ResultsDir,
                communityCount,
                InitialNumLabelsPerTask);
        }
        } //End AddModel

        /// <summary>
        /// Creates the <c>ExperimentModel</c> that corresponds to the given selection of run type,
        /// task selection method and worker selection method.
        /// </summary>
        /// <remarks>
        /// Majority-vote runs always use random worker selection and labelling round 1. Entropy task
        /// selection (for any other run type) takes the labelling round from the
        /// <c>trackBarNumberOfLabellingRounds</c> control. Every other combination uses labelling round 1.
        /// The starting point is read from <paramref name="labelStartingPoints"/> at index (round - 1).
        /// </remarks>
        /// <param name="currentRunType">The run type of the experiment item.</param>
        /// <param name="currentTaskSelectionMethod">The task selection method of the experiment item.</param>
        /// <param name="currentWorkerSelectionMethod">The worker selection method; ignored for majority vote.</param>
        /// <param name="labelStartingPoints">Starting points indexed by labelling round minus one.</param>
        /// <returns>The newly created experiment item.</returns>
        private ExperimentModel getExperimentItem(RunType currentRunType, TaskSelectionMethod currentTaskSelectionMethod, WorkerSelectionMethod currentWorkerSelectionMethod, int[] labelStartingPoints)
        {
            bool isMajorityVote = currentRunType == RunType.MajorityVote;

            // The labelling rounds from the track bar only apply to entropy task selection,
            // and never to majority vote.
            bool usesLabellingRounds = !isMajorityVote && currentTaskSelectionMethod == TaskSelectionMethod.EntropyTask;

            WorkerSelectionMethod workerMethod = isMajorityVote
                ? WorkerSelectionMethod.RandomWorker
                : currentWorkerSelectionMethod;

            int labellingRound = usesLabellingRounds
                ? trackBarNumberOfLabellingRounds.Value
                : 1;

            return new ExperimentModel(
                currentTaskSelectionMethod,
                workerMethod,
                currentRunType,
                labellingRound,
                labelStartingPoints[labellingRound - 1]);
        }
        } //End AddModel


        /// <summary>
        /// Builds the experiment item for the currently selected run type, task selection method and
        /// worker selection method.
        /// </summary>
        /// <remarks>
        /// Only entropy task selection on a run type other than majority vote uses the labelling round
        /// chosen on <c>trackBarNumberOfLabellingRounds</c>; all other cases use round 1 and the first
        /// entry of <paramref name="labelStartingPoints"/>. Majority vote additionally forces random
        /// worker selection.
        /// </remarks>
        /// <param name="currentRunType">The selected run type.</param>
        /// <param name="currentTaskSelectionMethod">The selected task selection method.</param>
        /// <param name="currentWorkerSelectionMethod">The selected worker selection method; replaced by random worker selection for majority vote.</param>
        /// <param name="labelStartingPoints">Starting points; the entry at (labelling round - 1) is used.</param>
        /// <returns>The experiment item describing the selection.</returns>
        private ExperimentModel getExperimentItem(RunType currentRunType, TaskSelectionMethod currentTaskSelectionMethod, WorkerSelectionMethod currentWorkerSelectionMethod, int[] labelStartingPoints)
        {
            // Entropy task selection (outside majority vote) is the only case with several labelling rounds.
            if (currentRunType != RunType.MajorityVote && currentTaskSelectionMethod == TaskSelectionMethod.EntropyTask)
            {
                int selectedRound = trackBarNumberOfLabellingRounds.Value;
                int startingPoint = labelStartingPoints[selectedRound - 1];
                return new ExperimentModel(currentTaskSelectionMethod, currentWorkerSelectionMethod, currentRunType, selectedRound, startingPoint);
            }

            // Single-round cases: majority vote has no worker selection of its own, so it falls back to random.
            WorkerSelectionMethod effectiveWorkerMethod = currentWorkerSelectionMethod;
            if (currentRunType == RunType.MajorityVote)
            {
                effectiveWorkerMethod = WorkerSelectionMethod.RandomWorker;
            }

            return new ExperimentModel(currentTaskSelectionMethod, effectiveWorkerMethod, currentRunType, 1, labelStartingPoints[0]);
        }
        /// <summary>
        /// Runs the standard active learning procedure on an array of model instances over one input data set.
        /// </summary>
        /// <remarks>
        /// Despite the name, the models are processed one after another inside each iteration of the main loop
        /// (a plain <c>for</c> over the model index). All per-model arrays are indexed by the same model index
        /// and are sized from <paramref name="runType"/>.
        /// The method writes to the members <c>accuracyArray</c>, <c>taskValueListArray</c>, <c>results</c> and
        /// <c>isExperimentCompleted</c>, and returns when <c>isExperimentCompleted</c> is observed to be true at
        /// the start of an iteration.
        /// NOTE(review): entries of <paramref name="workerSelectionMethod"/> may be overwritten with
        /// <c>RandomWorker</c>, so the caller's array is modified.
        /// </remarks>
        /// <param name="data">The data.</param>
        /// <param name="modelName">The model names, one per model.</param>
        /// <param name="runType">The model run types, one per model; its length defines the number of models.</param>
        /// <param name="model">The model instances, one per model.</param>
        /// <param name="taskSelectionMethod">The methods for selecting tasks (Entropy / Random / Uniform), one per model.</param>
        /// <param name="workerSelectionMethod">The methods for selecting workers (Random / Best), one per model.</param>
        /// <param name="communityCount">The number of communities (only for CBCC).</param>
        /// <param name="initialNumLabelsPerTask">The initial number of exploratory labels that are randomly selected for each task.</param>
        /// <param name="numIncremData">The number of data points to add at each iteration.</param>
        /// <exception cref="System.Exception">
        /// Thrown when the number of labels left after the initial selection is not greater than zero.
        /// </exception>
        /// <exception cref="System.ApplicationException">
        /// Thrown when a worker selection method is neither <c>BestWorker</c> nor <c>RandomWorker</c>.
        /// </exception>
        public static void RunParallelActiveLearning(IList<Datum> data,
            string[] modelName,
            RunType[] runType,
            BCC[] model,
            TaskSelectionMethod[] taskSelectionMethod,
            WorkerSelectionMethod[] workerSelectionMethod,
            int communityCount = -1,
            int initialNumLabelsPerTask = 1,
            int numIncremData = 1)
        {

            int numModels = runType.Length;
            // NOTE(review): stopWatch is never started and totalLabels is never read in this method.
            Stopwatch stopWatch = new Stopwatch();
            int totalLabels = data.Count();

            // Dictionary keyed by task Id, with randomly ordered labelings
            // (each task's labels are shuffled with a permutation drawn from Rand.Perm).
            var groupedRandomisedData =
                data.GroupBy(d => d.TaskId).
                Select(g =>
                {
                    var arr = g.ToArray();
                    int cnt = arr.Length;
                    var perm = Rand.Perm(cnt);
                    return new
                    {
                        key = g.Key,
                        arr = g.Select((t, i) => arr[perm[i]]).ToArray()
                    };
                }).ToDictionary(a => a.key, a => a.arr);

            // Dictionaries keyed by task Id: total number of labels per task, and the number of labels
            // currently in use per task (every task starts at initialNumLabelsPerTask).
            Dictionary<string, int> totalCounts = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => kvp.Value.Length);
            Dictionary<string, int> currentCounts = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => initialNumLabelsPerTask);

            // Keyed by task, value is a HashSet containing all the remaining workers with a label - workers are removed after adding a new datum
            Dictionary<string, HashSet<string>> remainingWorkersPerTask = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => new HashSet<string>(kvp.Value.Select(dat => dat.WorkerId)));


            int numTaskIds = totalCounts.Count();
            // Number of labels that are left once every task has received its initial labels.
            int totalInstances = data.Count - initialNumLabelsPerTask * numTaskIds;

            // Throw an exception if totalInstances is less than or equal to zero
            if (totalInstances <= 0)
            {
                throw new System.Exception("The variable 'totalInstances' should be greater than zero");
            }

            // Only create the accuracy lists when they do not exist yet (for GUI use)
            if (accuracyArray == null)
            {
                accuracyArray = Util.ArrayInit<List<double>>(numModels, i => new List<double>());

            }

            // Per-model logs: average recall, selected task values, and number of labels added so far.
            List<double>[] avgRecallArray = Util.ArrayInit(numModels, i => new List<double>());
            taskValueListArray = Util.ArrayInit(numModels, i => new List<ActiveLearningResult>());
            int[] indexArray = new int[numModels];

            Debug.WriteLine("Parallel Active Learning");
            Debug.WriteLine("\tModel\tAcc\tAvgRec");

            // Get initial data
            // The results variable is a member (not a local) so that the GUI can read it
            results = Util.ArrayInit<Results>(numModels, i => new Results());
            List<Datum> subData = GetSubdata(groupedRandomisedData, currentCounts, remainingWorkersPerTask);
            // Every model starts from its own copy of the same initial subset.
            List<Datum>[] subDataArray = Util.ArrayInit<List<Datum>>(numModels, i => new List<Datum>(subData));
            List<Datum>[] nextData = new List<Datum>[numModels];
            ActiveLearning[] activeLearning = new ActiveLearning[numModels];
            isExperimentCompleted = false;
        
            // Main loop; it has no loop condition and is only left through the return below
            for (int iter = 0; ; iter++)
            {
                bool calculateAccuracy = true;
                bool doSnapShot = iter % 100 == 0; // Frequency of snapshots
                
                // Stop active learning if the user requests to stop
                // NOTE(review): isExperimentCompleted is never set to true inside this method - presumably it is set from outside (e.g. the GUI); confirm.
                if (isExperimentCompleted) {
                    return;
                
                }

                // Run all the models
                for (int indexModel = 0; indexModel < numModels; indexModel++ )
                {
                    if (subDataArray[indexModel] != null || nextData[indexModel] != null)
                    {
                        switch (runType[indexModel])
                        {
                            case RunType.VoteDistribution:
                                results[indexModel].RunMajorityVote(subDataArray[indexModel], data, calculateAccuracy, true);
                                break;
                            case RunType.MajorityVote:
                                results[indexModel].RunMajorityVote(subDataArray[indexModel], data, calculateAccuracy, false);
                                break;
                            case RunType.DawidSkene:
                                results[indexModel].RunDawidSkene(subDataArray[indexModel], data, calculateAccuracy);
                                break;
                            default: // Run BCC models
                                results[indexModel].RunBCC(modelName[indexModel], subDataArray[indexModel], data, model[indexModel], RunMode.ClearResults, calculateAccuracy, communityCount, false);
                                break;
                        }
                    } // end if: there is data for this model

                    // Create the active learning helper on first use, otherwise feed it the latest results.
                    if (activeLearning[indexModel] == null)
                    {
                        activeLearning[indexModel] = new ActiveLearning(data, model[indexModel], results[indexModel], communityCount);
                    }
                    else
                    {
                        activeLearning[indexModel].UpdateActiveLearningResults(results[indexModel]);
                    }


                    // Select next task: build a dictionary keyed by task Id holding the value of each task
                    Dictionary<string, ActiveLearningResult> TaskUtility = new Dictionary<string, ActiveLearningResult>();
                    switch (taskSelectionMethod[indexModel]) 
                    {
                        case TaskSelectionMethod.EntropyTask:
                            TaskUtility = activeLearning[indexModel].EntropyTrueLabel();
                            break;

                        case TaskSelectionMethod.RandomTask:
                            // Every task gets a value drawn from Rand.Double()
                            TaskUtility = data.GroupBy(d => d.TaskId).ToDictionary(a => a.Key, a => new ActiveLearningResult
                            {
                                TaskValue = Rand.Double()
                            });
                            break;

                        case TaskSelectionMethod.UniformTask:
                            // Every task gets the same value 1; the tasks are inserted in ascending order of their current label count.
                            // NOTE(review): the ordering only matters if dictionary enumeration order is relied on downstream - confirm.
                            TaskUtility = currentCounts.OrderBy(kvp => kvp.Value).ToDictionary(a => a.Key, a => new ActiveLearningResult
                            {
                                TaskValue = 1
                            });
                            break;

                        default: // Any other value falls back to entropy task selection
                            TaskUtility = activeLearning[indexModel].EntropyTrueLabel();
                            break;
                    }

                    // We create a list of worker utilities
                    Dictionary<string, double> WorkerAccuracy = null;

                    // Best worker selection is only allowed for methods that infer worker confusion matrices.
                    // NOTE(review): this overwrites the entry in the caller's array.
                    if (results[indexModel].WorkerConfusionMatrix == null)
                        workerSelectionMethod[indexModel] = WorkerSelectionMethod.RandomWorker;

                    switch (workerSelectionMethod[indexModel])
                    {
                        case WorkerSelectionMethod.BestWorker:
                            // Assign worker accuracies to the maximum value on the diagonal of the confusion matrix (conservative approach).
                            // Alternative ways are also possible.
                            WorkerAccuracy = results[indexModel].WorkerConfusionMatrixMean.ToDictionary(
                                    kvp => kvp.Key,
                                    kvp => Results.GetConfusionMatrixDiagonal(kvp.Value).Max());
                            break;
                        case WorkerSelectionMethod.RandomWorker:
                            // Assign worker accuracies to random values
                            WorkerAccuracy = results[indexModel].FullMapping.WorkerIdToIndex.ToDictionary(kvp => kvp.Key, kvp => Rand.Double());
                            break;
                        default:
                            throw new ApplicationException("No worker selection method selected");
                    }

                    // Create a list of tuples (TaskId, WorkerId, ActiveLearningResult),
                    // one for every remaining (task, worker) pair
                    List<Tuple<string, string, ActiveLearningResult>> LabelValue = new List<Tuple<string, string, ActiveLearningResult>>();
                    foreach (var kvp in TaskUtility)
                    {
                        foreach (var workerId in remainingWorkersPerTask[kvp.Key])
                        {
                            var labelValue = new ActiveLearningResult
                            {
                                WorkerId = workerId,
                                TaskId = kvp.Key,
                                TaskValue = kvp.Value.TaskValue,
                                WorkerValue = WorkerAccuracy[workerId]
                            };
                            LabelValue.Add(Tuple.Create(labelValue.TaskId, labelValue.WorkerId, labelValue));
                        }
                    }

                    // Increment the active set with new data
                    // NOTE(review): currentCounts and remainingWorkersPerTask are the same instances for every model - confirm whether GetNextData mutates them.
                    nextData[indexModel] = GetNextData(groupedRandomisedData, LabelValue, currentCounts, totalCounts, remainingWorkersPerTask, numIncremData);

                    // NOTE(review): this break only leaves the loop over the models (later models are skipped for this
                    // iteration); the outer main loop keeps running until isExperimentCompleted is set - confirm this is intended.
                    if (nextData[indexModel] == null || nextData[indexModel].Count == 0)
                        break;


                    indexArray[indexModel] += nextData[indexModel].Count;
                    subDataArray[indexModel].AddRange(nextData[indexModel]);

                    // Logs
                    if (calculateAccuracy)
                    {
                        accuracyArray[indexModel].Add(results[indexModel].Accuracy);
                        avgRecallArray[indexModel].Add(results[indexModel].AvgRecall);

                        // NOTE(review): TaskUtility cannot be null here - the foreach over it above would already have thrown.
                        if (TaskUtility == null)
                        {
                            var sortedLabelValue = LabelValue.OrderByDescending(kvp => kvp.Item3.TaskValue).ToArray();
                            taskValueListArray[indexModel].Add(sortedLabelValue.First().Item3);
                        }
                        else
                        {

                            // Record the task value of the first newly added datum, together with its worker Id and task Id
                            ActiveLearningResult nextTaskValueItem = TaskUtility[nextData[indexModel].First().TaskId];
                            nextTaskValueItem.WorkerId = nextData[indexModel].First().WorkerId;
                            nextTaskValueItem.TaskId = nextData[indexModel].First().TaskId;
                            taskValueListArray[indexModel].Add(nextTaskValueItem);
                        }

                        if (doSnapShot)
                        {
                            Debug.WriteLine("{0} of {1}:\t{2}\t{3:0.000}\t{4:0.0000}", indexArray[indexModel], totalInstances, modelName[indexModel], accuracyArray[indexModel].Last(), avgRecallArray[indexModel].Last());
                        }
                    }
                }//end of models
            }//end of main loop
        }
        /// <summary>
        /// Runs the standard active learning procedure on a model instance and an input data set.
        /// </summary>
        /// <remarks>
        /// Starting from <paramref name="initialNumLabelsPerTask"/> labels per task, the method repeatedly runs
        /// the model on the current subset, scores the remaining (task, worker) pairs, and adds the data
        /// returned by <c>GetNextData</c> until that call returns no more data. It then writes a final snapshot
        /// through <c>DoSnapshot</c>.
        /// The method writes to the members <c>accuracy</c>, <c>taskValueList</c> and <c>isExperimentCompleted</c>,
        /// and prints progress to the console.
        /// </remarks>
        /// <param name="data">The data.</param>
        /// <param name="modelName">The model name.</param>
        /// <param name="runType">The model run type.</param>
        /// <param name="model">The model instance.</param>
        /// <param name="taskSelectionMethod">The method for selecting tasks (Entropy / Random / Uniform).</param>
        /// <param name="workerSelectionMethod">The method for selecting workers (Random / Best). Replaced by Random when the results contain no worker confusion matrix.</param>
        /// <param name="resultsDir">The directory passed to <c>DoSnapshot</c> for the final snapshot.</param>
        /// <param name="communityCount">The number of communities (only for CBCC).</param>
        /// <param name="initialNumLabelsPerTask">The initial number of exploratory labels that are randomly selected for each task.</param>
        /// <param name="numIncremData">The number of data points to add at each round.</param>
        /// <exception cref="System.Exception">
        /// Thrown when the computed number of remaining labels is not greater than zero.
        /// </exception>
        /// <exception cref="System.ApplicationException">
        /// Thrown when the worker selection method is neither <c>BestWorker</c> nor <c>RandomWorker</c>.
        /// </exception>
        public static void RunActiveLearning(
            IList<Datum> data,
            string modelName,
            RunType runType,
            BCC model,
            TaskSelectionMethod taskSelectionMethod,
            WorkerSelectionMethod workerSelectionMethod,
            string resultsDir,
            int communityCount = -1,
            int initialNumLabelsPerTask = 1,
            int numIncremData = 1)
        {
            // Count elapsed time
            Stopwatch stopWatchTotal = new Stopwatch();
            stopWatchTotal.Start();
            // NOTE(review): totalLabels is never read in this method.
            int totalLabels = data.Count();

            // Dictionary keyed by task Id, with randomly ordered labelings
            // (each task's labels are shuffled with a permutation drawn from Rand.Perm).
            var groupedRandomisedData =
                data.GroupBy(d => d.TaskId).
                Select(g =>
                {
                    var arr = g.ToArray();
                    int cnt = arr.Length;
                    var perm = Rand.Perm(cnt);
                    return new
                    {
                        key = g.Key,
                        arr = g.Select((t, i) => arr[perm[i]]).ToArray()
                    };
                }).ToDictionary(a => a.key, a => a.arr);

            // Dictionary keyed by task Id, with label counts
            Dictionary<string, int> totalCounts = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => kvp.Value.Length);
            
            // Keyed by task, value is a HashSet containing all the remaining workers with a label - workers are removed after adding a new datum
            Dictionary<string, HashSet<string>> remainingWorkersPerTask = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => new HashSet<string>(kvp.Value.Select(dat => dat.WorkerId)));
            int numTaskIds = totalCounts.Count();

            // Number of labels left after the initial selection. When no initial labels are requested,
            // numIncremData is subtracted from the total instead.
            int totalInstances = initialNumLabelsPerTask > 0 ? data.Count - initialNumLabelsPerTask * numTaskIds : data.Count - numIncremData;

            // Throw an exception if totalInstances is less than or equal to zero
            if (totalInstances <= 0)
            {
                throw new System.Exception("The variable 'totalInstances' should be greater than zero");
            }
            
            // NOTE(review): WorkerIds is never read in this method.
            string[] WorkerIds = data.Select(d => d.WorkerId).Distinct().ToArray();

            // Only create the accuracy list when it does not exist yet (for GUI use)
            if (accuracy == null)
            { 
                accuracy = new List<double>();
            }
            
            List<double> avgRecall = new List<double>();
            // taskValueList is a member rather than a local, so it is replaced on every call
            taskValueList = new List<ActiveLearningResult>();
            // Number of labels added so far
            int index = 0;

            Console.WriteLine("Active Learning: {0}", modelName);
            Console.WriteLine("\t\t\t\t\t\tAcc\tAvgRec");

            // Get initial data: every task starts with initialNumLabelsPerTask labels in use
            Results results = new Results();
            Dictionary<string, int> currentCounts = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => initialNumLabelsPerTask);
            List<Datum> subData = GetSubdata(groupedRandomisedData, currentCounts, remainingWorkersPerTask);

            // NOTE(review): s (total number of remaining workers over all tasks) is never read in this method.
            var s = remainingWorkersPerTask.Select(w => w.Value.Count).Sum();
            List<Datum> nextData = null;
            ActiveLearning activeLearning = null;
            isExperimentCompleted = false;
            for (int iter = 0; ; iter++) // run until the data runs out (left through the break below)
            {
                bool calculateAccuracy = true;
                // iter % 1 is always 0, so a progress line is printed on every iteration
                bool doSnapShot = iter % 1 == 0;
                if (subData != null || nextData != null)
                {
                    switch (runType)
                    {
                        case RunType.VoteDistribution:
                            results.RunMajorityVote(subData, data, calculateAccuracy, true);
                            break;
                        case RunType.MajorityVote:
                            results.RunMajorityVote(subData, data, calculateAccuracy, false);
                            break;
                        case RunType.DawidSkene:
                            results.RunDawidSkene(subData, data, calculateAccuracy);
                            break;
                        default: // Run BCC models
                            results.RunBCC(modelName, subData, data, model, RunMode.ClearResults, calculateAccuracy, communityCount, false);
                            break;
                    }
                }

                // Create the active learning helper on first use, otherwise feed it the latest results.
                if (activeLearning == null)
                {
                    activeLearning = new ActiveLearning(data, model, results, communityCount);
                }
                else
                {
                    activeLearning.UpdateActiveLearningResults(results);
                }

                // We create a list of task utilities
                // TaskUtility: Dictionary keyed by task, the value is an active learning result.
                Dictionary<string, ActiveLearningResult> TaskUtility = null;
                switch (taskSelectionMethod)
                {
                    case TaskSelectionMethod.EntropyTask:
                        TaskUtility = activeLearning.EntropyTrueLabel();
                        break;

                    case TaskSelectionMethod.RandomTask:
                        // Every task gets a value drawn from Rand.Double()
                        TaskUtility = data.GroupBy(d => d.TaskId).ToDictionary(a => a.Key, a => new ActiveLearningResult
                        {
                            TaskValue = Rand.Double()

                        });
                        break;

                    case TaskSelectionMethod.UniformTask:
                        // Reproduce uniform task selection by picking the task with the lowest number of current labels. That is, minus the current count.
                        TaskUtility = currentCounts.OrderBy(kvp => kvp.Value).ToDictionary(a => a.Key, a => new ActiveLearningResult
                        {
                            TaskId = a.Key,
                            TaskValue = -a.Value
                        });
                        break;

                    default: // Any other value falls back to entropy task selection
                        TaskUtility = activeLearning.EntropyTrueLabel();
                        break;
                }

                
                // We create a list of worker utilities.
                Dictionary<string, double> WorkerAccuracy = null;

                // Best worker selection is only allowed for methods that infer worker confusion matrices.
                if (results.WorkerConfusionMatrix == null)
                    workerSelectionMethod = WorkerSelectionMethod.RandomWorker;

                switch (workerSelectionMethod)
                {
                    case WorkerSelectionMethod.BestWorker:
                        // Assign worker accuracies to the maximum value on the diagonal of the confusion matrix (conservative approach).
                        // Alternative ways are also possible.
                        WorkerAccuracy = results.WorkerConfusionMatrixMean.ToDictionary(
                                kvp => kvp.Key,
                                kvp => Results.GetConfusionMatrixDiagonal(kvp.Value).Max());
                        break;
                    case WorkerSelectionMethod.RandomWorker:
                        // Assign worker accuracies to random values
                        WorkerAccuracy = results.FullMapping.WorkerIdToIndex.ToDictionary(kvp => kvp.Key, kvp => Rand.Double());
                        break;
                    default:
                        throw new ApplicationException("No worker selection method selected");
                }

                // Create a list of tuples (TaskId, WorkerId, ActiveLearningResult),
                // one for every remaining (task, worker) pair.
                List<Tuple<string, string, ActiveLearningResult>> LabelValue = new List<Tuple<string,string,ActiveLearningResult>>();
                foreach (var kvp in TaskUtility)
                {
                    foreach (var workerId in remainingWorkersPerTask[kvp.Key])
                    {
                        var labelValue = new ActiveLearningResult
                        {
                            WorkerId = workerId,
                            TaskId = kvp.Key,
                            TaskValue = kvp.Value.TaskValue,
                            WorkerValue = WorkerAccuracy[workerId]
                        };
                        LabelValue.Add(Tuple.Create(labelValue.TaskId, labelValue.WorkerId, labelValue));
                    }
                }
                
                // Increment the active set with new data
                nextData = GetNextData(groupedRandomisedData, LabelValue, currentCounts, totalCounts, remainingWorkersPerTask, numIncremData);

                // No more data to add: leave the loop
                if (nextData == null || nextData.Count == 0)
                    break;

                index += nextData.Count;
                subData.AddRange(nextData);

                // Logs
                if (calculateAccuracy)
                {
                    accuracy.Add(results.Accuracy);
                    avgRecall.Add(results.AvgRecall);

                    // NOTE(review): TaskUtility cannot be null here - the foreach over it above would already have thrown.
                    if (TaskUtility == null) 
                    {
                        var sortedLabelValue = LabelValue.OrderByDescending(kvp => kvp.Item3.TaskValue).ToArray();
                        taskValueList.Add(sortedLabelValue.First().Item3);
                    }
                    else 
                    {

                        // Record the task value of the first newly added datum, together with its worker Id
                        ActiveLearningResult nextTaskValueItem = TaskUtility[nextData.First().TaskId];
                        nextTaskValueItem.WorkerId = nextData.First().WorkerId;

                        // ... and its task Id
                        nextTaskValueItem.TaskId = nextData.First().TaskId;

                        taskValueList.Add(nextTaskValueItem);
                    }

                    if (doSnapShot)
                    {
                        Console.WriteLine("{0} (label {1} of {2}):\t{3:0.000}\t{4:0.0000}", modelName, index, totalInstances, accuracy.Last(), avgRecall.Last());
                        // Interim snapshots are currently disabled:
                        //DoSnapshot(accuracy, nlpd, avgRecall, taskValueList, results, modelName, "interim", resultsDir, initialNumLabelsPerTask);
                    }
                }//end if logs
            }//end for all data

            isExperimentCompleted = true;

            stopWatchTotal.Stop();
            // Write the final snapshot, then reset the accuracy list
            // NOTE(review): ResetAccuracyList is defined elsewhere; presumably it clears the accuracy member for the next run - confirm.
            DoSnapshot(accuracy, avgRecall, taskValueList, results, modelName, "final", resultsDir, initialNumLabelsPerTask);
            ResetAccuracyList();
            Console.WriteLine("Elapsed time: {0}\n", stopWatchTotal.Elapsed);
        }
Ejemplo n.º 8
0
 /// <summary>
 /// Returns the model name as a string of the form <c>dataset_RunType_TaskSelectionMethod</c>.
 /// </summary>
 /// <remarks>
 /// Only the data set name, the run type and the task selection method contribute to the name.
 /// <paramref name="workerSelectionMethod"/> and <paramref name="numCommunities"/> are accepted but not used.
 /// </remarks>
 /// <param name="dataset">The name of the data set.</param>
 /// <param name="runType">The model run type; its enum member name is used.</param>
 /// <param name="taskSelectionMethod">The method for selecting tasks; its enum member name is used.</param>
 /// <param name="workerSelectionMethod">The method for selecting workers (not part of the name).</param>
 /// <param name="numCommunities">The number of communities (not part of the name).</param>
 /// <returns>The model name.</returns>
 public static string GetModelName(string dataset, RunType runType, TaskSelectionMethod taskSelectionMethod, WorkerSelectionMethod workerSelectionMethod, int numCommunities = -1)
 {
     const string separator = "_";

     string runTypeLabel = Enum.GetName(typeof(RunType), runType);
     string taskLabel = Enum.GetName(typeof(TaskSelectionMethod), taskSelectionMethod);

     return string.Concat(dataset, separator, runTypeLabel, separator, taskLabel);
 }
Ejemplo n.º 9
0
        /// <summary>
        /// Runs the HCOMP active learning experiment for the selected model(s) over an
        /// inclusive range of entries in <c>Program.Datasets</c>.
        /// </summary>
        /// <param name="startIndex">Index of the first data set to run (inclusive).</param>
        /// <param name="endIndex">Index of the last data set to run (inclusive).</param>
        /// <param name="whichModel">1 = MajorityVote, 2 = DawidSkene, 3 = BCC, 4 = CBCC; any other value runs the whole sequence.</param>
        /// <param name="currentTaskSelectionMethod">The task selection method passed to every run.</param>
        /// <param name="InitialNumLabelsPerTask">The initial number of labels per task passed to every run.</param>
        public static void RunHCOMPExperiments(int startIndex, int endIndex, int whichModel, TaskSelectionMethod currentTaskSelectionMethod, int InitialNumLabelsPerTask)
        {
            for (int ds = startIndex; ds <= endIndex; ds++)
            {
                var dataSet = Program.Datasets[ds];

                if (whichModel == 1)
                {
                    RunHCOMPActiveLearning(dataSet, RunType.MajorityVote, currentTaskSelectionMethod, InitialNumLabelsPerTask, null);
                }
                else if (whichModel == 2)
                {
                    RunHCOMPActiveLearning(dataSet, RunType.DawidSkene, currentTaskSelectionMethod, InitialNumLabelsPerTask, null);
                }
                else if (whichModel == 3)
                {
                    RunHCOMPActiveLearning(dataSet, RunType.BCC, currentTaskSelectionMethod, InitialNumLabelsPerTask, new BCC());
                }
                else if (whichModel == 4)
                {
                    // The community count is looked up per data set, only for CBCC.
                    RunHCOMPActiveLearning(dataSet, RunType.CBCC, currentTaskSelectionMethod, InitialNumLabelsPerTask, new CBCC(), Program.NumCommunities[ds]);
                }
                else
                {
                    // Any other selector: run every model in sequence.
                    RunHCOMPActiveLearning(dataSet, RunType.MajorityVote, currentTaskSelectionMethod, InitialNumLabelsPerTask, null);
                    RunHCOMPActiveLearning(dataSet, RunType.DawidSkene, currentTaskSelectionMethod, InitialNumLabelsPerTask, null);
                    RunHCOMPActiveLearning(dataSet, RunType.BCC, currentTaskSelectionMethod, InitialNumLabelsPerTask, new BCC());
                    RunHCOMPActiveLearning(dataSet, RunType.CBCC, currentTaskSelectionMethod, InitialNumLabelsPerTask, new CBCC(), Program.NumCommunities[ds]);

                    // NOTE(review): this last run pairs RunType.BCC with a CBCC instance and no
                    // community count, repeating the BCC run type above - looks like a copy/paste
                    // slip; confirm whether it is intended before relying on its results.
                    RunHCOMPActiveLearning(dataSet, RunType.BCC, currentTaskSelectionMethod, InitialNumLabelsPerTask, new CBCC());
                }
            }
        }
 /// <summary>
 /// Constructor for non-EntropyMABTask Selection Method.
 /// Delegates to the (runType, numberOfLabellingRound, labelStartingPoint) constructor
 /// and then records the task and worker selection methods.
 /// </summary>
 /// <param name="taskSelectionMethod">The task selection method stored on this instance.</param>
 /// <param name="workerSelectionMethod">The worker selection method stored on this instance.</param>
 /// <param name="runType">The run type, forwarded to the chained constructor.</param>
 /// <param name="numberOfLabellingRound">Forwarded to the chained constructor.</param>
 /// <param name="labelStartingPoint">Forwarded to the chained constructor.</param>
 public ExperimentModel(TaskSelectionMethod taskSelectionMethod, WorkerSelectionMethod workerSelectionMethod, RunType runType, int numberOfLabellingRound, int labelStartingPoint)
     : this(runType, numberOfLabellingRound, labelStartingPoint)
 {
     this.taskSelectionMethod   = taskSelectionMethod;
     this.WorkerSelectionMethod = workerSelectionMethod;
 }
Ejemplo n.º 11
0
 /// <summary>
 /// Builds the model name from the data set, run type and selection methods,
 /// followed by optional suffixes for online mode, sample counts and community count.
 /// </summary>
 /// <param name="dataset">The name of the data set.</param>
 /// <param name="runType">The model run type.</param>
 /// <param name="taskSelectionMetric">The task selection method.</param>
 /// <param name="workerSelectionMetric">The worker selection method.</param>
 /// <param name="online">When true, "Online" is appended directly after the worker selection name.</param>
 /// <param name="taskSamples">Appended as "_T{n}" only when greater than zero.</param>
 /// <param name="workerSamples">Appended as "_W{n}" only when greater than zero.</param>
 /// <param name="numCommunities">Appended as "_Comm{n}" only when greater than zero.</param>
 /// <returns>The assembled model name.</returns>
 private static string GetModelName(string dataset, RunType runType, TaskSelectionMethod taskSelectionMetric, WorkerSelectionMethod workerSelectionMetric, bool online, int taskSamples = -1, int workerSamples = -1, int numCommunities = -1)
 {
     // Mandatory part: dataset_RunType_TaskSelection_WorkerSelection
     string name = dataset + "_" + Enum.GetName(typeof(RunType), runType);
     name += "_" + Enum.GetName(typeof(TaskSelectionMethod), taskSelectionMetric);
     name += "_" + Enum.GetName(typeof(WorkerSelectionMethod), workerSelectionMetric);

     // Optional suffixes, in fixed order.
     if (online)
     {
         name += "Online";
     }

     if (taskSamples > 0)
     {
         name += "_T" + taskSamples.ToString();
     }

     if (workerSamples > 0)
     {
         name += "_W" + workerSamples.ToString();
     }

     if (numCommunities > 0)
     {
         name += "_Comm" + numCommunities.ToString();
     }

     return name;
 }
        /// <summary>
        /// Collects the settings of every configured experiment model and runs them
        /// through <c>ActiveLearning.RunParallelActiveLearning</c>.
        /// The signature matches a BackgroundWorker DoWork-style callback; neither
        /// argument is read by this method.
        /// </summary>
        /// <param name="worker">The background worker (unused here).</param>
        /// <param name="e">The DoWork event arguments (unused here).</param>
        public void RunParallelActiveLearning(
            System.ComponentModel.BackgroundWorker worker,
            System.ComponentModel.DoWorkEventArgs e)
        {
            // State object for this run; flagged as complete at the very end.
            CurrentParallelState runState = new CurrentParallelState();

            int modelCount = GetNumberOfExperiemntModels();

            // Clear previous results, then keep a reference to ActiveLearning's accuracy array.
            ActiveLearning.ResetParallelAccuracyList(modelCount);
            accuracyArrayOfAllExperimentModels = ActiveLearning.accuracyArray;

            // Per-model settings, all indexed by model position.
            RunType[] runTypes = new RunType[modelCount];
            TaskSelectionMethod[] taskSelectionMethods = new TaskSelectionMethod[modelCount];
            WorkerSelectionMethod[] workerSelectionMethods = new WorkerSelectionMethod[modelCount];
            BCC[] bccModels = new BCC[modelCount];

            for (int modelIndex = 0; modelIndex < modelCount; modelIndex++)
            {
                ExperimentModel experimentModel = GetExperimentModel(modelIndex);
                RunType runType = experimentModel.runType;

                runTypes[modelIndex] = runType;
                taskSelectionMethods[modelIndex] = experimentModel.taskSelectionMethod;

                // Only DawidSkene, BCC and CBCC get a worker selection method;
                // only BCC and CBCC additionally get a model instance.
                switch (runType)
                {
                case RunType.DawidSkene:
                    workerSelectionMethods[modelIndex] = experimentModel.WorkerSelectionMethod;
                    break;

                case RunType.BCC:
                    workerSelectionMethods[modelIndex] = experimentModel.WorkerSelectionMethod;
                    bccModels[modelIndex] = new BCC();
                    break;

                case RunType.CBCC:
                    workerSelectionMethods[modelIndex] = experimentModel.WorkerSelectionMethod;
                    bccModels[modelIndex] = new CBCC();
                    break;
                }
            }

            // One model name per entry, derived from the data set name and the run type.
            string[] modelNames = new string[modelCount];
            for (int modelIndex = 0; modelIndex < modelCount; modelIndex++)
            {
                modelNames[modelIndex] = CrowdsourcingModels.Program.GetModelName(currentDataset.GetDataSetNameWithoutExtension(), runTypes[modelIndex]);
            }

            ActiveLearning.RunParallelActiveLearning(
                currentDataset.LoadData(),
                modelNames,
                runTypes,
                bccModels,
                taskSelectionMethods,
                workerSelectionMethods,
                communityCount,
                numberOfLabellingRound);

            runState.isRunningComplete = true;
            Debug.WriteLine("RunParallelActiveLearning Complete");
        }//end function RunParallelActiveLearning
 /// <summary>
 /// Constructor for non-EntropyMABTask Selection Method.
 /// Delegates to the (runType, numberOfLabellingRound, labelStartingPoint) constructor
 /// and then records the task and worker selection methods.
 /// </summary>
 /// <param name="taskSelectionMethod">The task selection method stored on this instance.</param>
 /// <param name="workerSelectionMethod">The worker selection method stored on this instance.</param>
 /// <param name="runType">The run type, forwarded to the chained constructor.</param>
 /// <param name="numberOfLabellingRound">Forwarded to the chained constructor.</param>
 /// <param name="labelStartingPoint">Forwarded to the chained constructor.</param>
 public ExperimentModel(TaskSelectionMethod taskSelectionMethod, WorkerSelectionMethod workerSelectionMethod, RunType runType, int numberOfLabellingRound, int labelStartingPoint)
     : this(runType, numberOfLabellingRound, labelStartingPoint)
 {
     this.taskSelectionMethod = taskSelectionMethod;
     this.WorkerSelectionMethod = workerSelectionMethod;
    
 }
Ejemplo n.º 14
0
 /// <summary>
 /// Returns the model name as a string.
 /// </summary>
 /// <param name="dataset">The name of the data set.</param>
 /// <param name="runType">The model run type.</param>
 /// <param name="taskSelectionMethod">The method for selecting tasks (Random / Entropy).</param>
 /// <param name="workerSelectionMethod">The method for selecting workers (only Random is implemented).</param>
 /// <returns>The model name, formatted as <c>dataset_RunType_TaskSelectionMethod_WorkerSelectionMethod</c>.</returns>
 public static string GetModelName(string dataset, RunType runType, TaskSelectionMethod taskSelectionMethod, WorkerSelectionMethod workerSelectionMethod)
 {
     // An enum value never equals a string, so there is no "empty" selection method
     // to special-case: both selection method names are always appended.
     return dataset + "_" + Enum.GetName(typeof(RunType), runType)
         + "_" + Enum.GetName(typeof(TaskSelectionMethod), taskSelectionMethod)
         + "_" + Enum.GetName(typeof(WorkerSelectionMethod), workerSelectionMethod);
 }
Ejemplo n.º 15
0
        /// <summary>
        /// Collects the settings of every configured experiment model and runs them
        /// through <c>ActiveLearning.RunParallelActiveLearning</c>.
        /// The signature matches a BackgroundWorker DoWork-style callback; neither
        /// argument is read by this method.
        /// </summary>
        /// <param name="worker">The background worker (unused here).</param>
        /// <param name="e">The DoWork event arguments (unused here).</param>
        public void RunParallelActiveLearning(
            System.ComponentModel.BackgroundWorker worker,
            System.ComponentModel.DoWorkEventArgs e)
        {
            // State object for this run; flagged as complete at the very end.
            CurrentParallelState runState = new CurrentParallelState();

            int modelCount = GetNumberOfExperiemntModels();

            // Clear previous results, then keep a reference to ActiveLearning's accuracy array.
            ActiveLearning.ResetParallelAccuracyList(modelCount);
            accuracyArrayOfAllExperimentModels = ActiveLearning.accuracyArray;

            // Per-model settings, all indexed by model position.
            RunType[] runTypes = new RunType[modelCount];
            TaskSelectionMethod[] taskSelectionMethods = new TaskSelectionMethod[modelCount];
            WorkerSelectionMethod[] workerSelectionMethods = new WorkerSelectionMethod[modelCount];
            BCC[] bccModels = new BCC[modelCount];

            for (int modelIndex = 0; modelIndex < modelCount; modelIndex++)
            {
                ExperimentModel experimentModel = GetExperimentModel(modelIndex);
                RunType runType = experimentModel.runType;

                runTypes[modelIndex] = runType;
                taskSelectionMethods[modelIndex] = experimentModel.taskSelectionMethod;

                // Only DawidSkene, BCC and CBCC get a worker selection method;
                // only BCC and CBCC additionally get a model instance.
                switch (runType)
                {
                case RunType.DawidSkene:
                    workerSelectionMethods[modelIndex] = experimentModel.WorkerSelectionMethod;
                    break;

                case RunType.BCC:
                    workerSelectionMethods[modelIndex] = experimentModel.WorkerSelectionMethod;
                    bccModels[modelIndex] = new BCC();
                    break;

                case RunType.CBCC:
                    workerSelectionMethods[modelIndex] = experimentModel.WorkerSelectionMethod;
                    bccModels[modelIndex] = new CBCC();
                    break;
                }
            }

            // One model name per entry, derived from the data set name and the run type.
            string[] modelNames = new string[modelCount];
            for (int modelIndex = 0; modelIndex < modelCount; modelIndex++)
            {
                modelNames[modelIndex] = CrowdsourcingModels.Program.GetModelName(currentDataset.GetDataSetNameWithoutExtension(), runTypes[modelIndex]);
            }

            ActiveLearning.RunParallelActiveLearning(
                currentDataset.LoadData(),
                modelNames,
                runTypes,
                bccModels,
                taskSelectionMethods,
                workerSelectionMethods,
                communityCount,
                numberOfLabellingRound);

            runState.isRunningComplete = true;
            Debug.WriteLine("RunParallelActiveLearning Complete");
        }//end function RunParallelActiveLearning
Ejemplo n.º 16
0
        /// <summary>
        /// Runs the standard active learning procedure on a model instance and an input data set.
        /// Each iteration re-runs the chosen model on the labels revealed so far, scores the tasks,
        /// reveals the next batch of labels, and logs accuracy / average recall. The loop stops after
        /// 500 iterations or as soon as no further labels are returned.
        /// </summary>
        /// <param name="data">The data.</param>
        /// <param name="modelName">The model name.</param>
        /// <param name="runType">The model run type.</param>
        /// <param name="model">The model instance.</param>
        /// <param name="taskSelectionMethod">The method for selecting tasks (Random / Entropy).</param>
        /// <param name="workerSelectionMethod">The method for selecting workers (only Random is implemented); not read by this method.</param>
        /// <param name="resultsDir">The directory to save the log files.</param>
        /// <param name="communityCount">The number of communities (only for CBCC).</param>
        /// <param name="initialNumLabelsPerTask">The initial number of exploratory labels that are randomly selected for each task.</param>
        public static void RunActiveLearning(IList <Datum> data, string modelName, RunType runType, BCC model, TaskSelectionMethod taskSelectionMethod, WorkerSelectionMethod workerSelectionMethod, string resultsDir, int communityCount = -1, int initialNumLabelsPerTask = 1)
        {
            //Count elapsed time
            Stopwatch stopWatchTotal = new Stopwatch();

            stopWatchTotal.Start();

            // Dictionary keyed by task Id, with randomly order labelings
            var groupedRandomisedData =
                data.GroupBy(d => d.TaskId).
                Select(g =>
            {
                var arr  = g.ToArray();
                int cnt  = arr.Length;
                var perm = Rand.Perm(cnt);
                return(new
                {
                    key = g.Key,
                    arr = g.Select((t, i) => arr[perm[i]]).ToArray()
                });
            }).ToDictionary(a => a.key, a => a.arr);

            // Dictionary keyed by task Id, with label counts
            Dictionary <string, int> totalCounts   = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => kvp.Value.Length);
            Dictionary <string, int> currentCounts = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => initialNumLabelsPerTask);

            // Keyed by task, value is a HashSet containing all the remaining workers with a label - workers are removed after adding a new datum
            Dictionary <string, HashSet <string> > remainingWorkersPerTask = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => new HashSet <string>(kvp.Value.Select(dat => dat.WorkerId)));
            int numTaskIds     = totalCounts.Count;
            // Number of labels left to reveal once every task has its initial labels
            int totalInstances = data.Count - initialNumLabelsPerTask * numTaskIds;

            // Log structures
            List <double> accuracy  = new List <double>();
            List <double> nlpd      = new List <double>();
            List <double> avgRecall = new List <double>();
            List <ActiveLearningResult> taskValueList = new List <ActiveLearningResult>();
            int index = 0;

            Console.WriteLine("Active Learning: {0}", modelName);
            Console.WriteLine("\t\tAcc\tAvgRec");

            // Get initial data
            Results      results = new Results();
            List <Datum> subData = null;

            subData = GetSubdata(groupedRandomisedData, currentCounts, remainingWorkersPerTask);
            List <Datum>   nextData       = null;
            int            numIncremData  = 3;
            ActiveLearning activeLearning = null;


            for (int iter = 0; iter < 500; iter++)
            {
                bool calculateAccuracy = true;
                ////bool doSnapShot = iter % 100 == 0; // Frequency of snapshots
                bool doSnapShot = true;
                if (subData != null || nextData != null)
                {
                    switch (runType)
                    {
                    case RunType.VoteDistribution:
                        results.RunMajorityVote(subData, calculateAccuracy, true);
                        break;

                    case RunType.MajorityVote:
                        results.RunMajorityVote(subData, calculateAccuracy, false);
                        break;

                    case RunType.DawidSkene:
                        results.RunDawidSkene(subData, calculateAccuracy);
                        break;

                    default:     // Run BCC models
                        results.RunBCC(modelName, subData, data, model, Results.RunMode.ClearResults, calculateAccuracy, communityCount, false);
                        break;
                    }
                }

                // The ActiveLearning helper is created once, then refreshed with each new set of results
                if (activeLearning == null)
                {
                    activeLearning = new ActiveLearning(data, model, results, communityCount);
                }
                else
                {
                    activeLearning.UpdateActiveLearningResults(results);
                }

                // Select next task
                Dictionary <string, ActiveLearningResult>            TaskValue  = null;
                // NOTE(review): LabelValue is never assigned in this method, so the
                // "TaskValue == null" logging branch below would throw if it were ever reached.
                List <Tuple <string, string, ActiveLearningResult> > LabelValue = null;
                switch (taskSelectionMethod)
                {
                case TaskSelectionMethod.EntropyTask:
                    TaskValue = activeLearning.EntropyTrueLabelPosterior();
                    break;

                case TaskSelectionMethod.RandomTask:
                    // Random selection: every task gets a fresh random score
                    TaskValue = data.GroupBy(d => d.TaskId).ToDictionary(a => a.Key, a => new ActiveLearningResult
                    {
                        TaskValue = Rand.Double()
                    });
                    break;

                default:     // Entropy task selection
                    TaskValue = activeLearning.EntropyTrueLabelPosterior();
                    break;
                }

                nextData = GetNextData(groupedRandomisedData, TaskValue, currentCounts, totalCounts, numIncremData);

                // Nothing left to reveal: stop early
                if (nextData == null || nextData.Count == 0)
                {
                    break;
                }

                index += nextData.Count;
                subData.AddRange(nextData);

                // Logs
                if (calculateAccuracy)
                {
                    accuracy.Add(results.Accuracy);
                    nlpd.Add(results.NegativeLogProb);
                    avgRecall.Add(results.AvgRecall);

                    if (TaskValue == null)
                    {
                        var sortedLabelValue = LabelValue.OrderByDescending(kvp => kvp.Item3.TaskValue).ToArray();
                        taskValueList.Add(sortedLabelValue.First().Item3);
                    }
                    else
                    {
                        taskValueList.Add(TaskValue[nextData.First().TaskId]);
                    }

                    if (doSnapShot)
                    {
                        Console.WriteLine("{0} of {1}:\t{2:0.000}\t{3:0.0000}", index, totalInstances, accuracy.Last(), avgRecall.Last());
                        DoSnapshot(accuracy, nlpd, avgRecall, taskValueList, results, modelName, "interim", resultsDir);
                    }
                }
            }
            stopWatchTotal.Stop();
            DoSnapshot(accuracy, nlpd, avgRecall, taskValueList, results, modelName, "final", resultsDir);
            Console.WriteLine("Elapsed time: {0}\n", stopWatchTotal.Elapsed);
        }
Ejemplo n.º 17
0
 /// <summary>
 /// Returns the model name as a string.
 /// </summary>
 /// <param name="dataset">The name of the data set.</param>
 /// <param name="runType">The model run type.</param>
 /// <param name="taskSelectionMethod">The method for selecting tasks (Random / Entropy).</param>
 /// <param name="workerSelectionMethod">The method for selecting workers (only Random is implemented).</param>
 /// <returns>The model name, formatted as <c>dataset_RunType_TaskSelectionMethod_WorkerSelectionMethod</c>.</returns>
 public static string GetModelName(string dataset, RunType runType, TaskSelectionMethod taskSelectionMethod, WorkerSelectionMethod workerSelectionMethod)
 {
     // An enum value never equals a string, so there is no "empty" selection method
     // to special-case: both selection method names are always appended.
     return(dataset + "_" + Enum.GetName(typeof(RunType), runType)
            + "_" + Enum.GetName(typeof(TaskSelectionMethod), taskSelectionMethod)
            + "_" + Enum.GetName(typeof(WorkerSelectionMethod), workerSelectionMethod));
 }