/// <summary>
        /// Creates an active learning instance bound to a model and a results object.
        /// </summary>
        /// <param name="data">The data (not read by this constructor).</param>
        /// <param name="model">The model instance; the instance is flagged as a community model when this is a CBCC.</param>
        /// <param name="results">The results instance, stored as both the active learning results and the batch results.</param>
        /// <param name="numCommunities">The number of communities (only for CBCC; not read by this constructor).</param>
        public ActiveLearning(IList<Datum> data, BCC model, Results results, int numCommunities)
        {
            this.bcc = model;

            // The instance counts as a community model exactly when the model is a CBCC.
            IsCommunityModel = model is CBCC;

            // Both result properties start out referring to the same results object.
            ActiveLearningResults = results;
            BatchResults = results;
            isExperimentCompleted = false;

            // The prediction data starts out as an empty list.
            PredictionData = new List<Datum>();
        }
Exemplo n.º 2
0
        /// <summary>
        /// Runs the method selected by <paramref name="runType"/> once, passing the complete
        /// data set for both data arguments of the underlying run call.
        /// </summary>
        /// <param name="dataSet">The dataset name (used to resolve the model name).</param>
        /// <param name="data">The data.</param>
        /// <param name="runType">The model run type.</param>
        /// <param name="model">The model instance (only passed on for run types handled by the BCC branch).</param>
        /// <param name="numCommunities">The number of communities (only for CBCC).</param>
        /// <returns>A new results object holding the outcome of the run.</returns>
        public static Results RunGold(string dataSet, IList<Datum> data, RunType runType, BCC model, int numCommunities = 2)
        {
            // The model name is resolved up front, although only the BCC branch consumes it.
            string modelName = Program.GetModelName(dataSet, runType);
            var goldResults = new Results();

            if (runType == RunType.VoteDistribution || runType == RunType.MajorityVote)
            {
                // Both voting run types share one call; only the last flag differs between them.
                bool isVoteDistribution = runType == RunType.VoteDistribution;
                goldResults.RunMajorityVote(data, data, true, isVoteDistribution);
            }
            else if (runType == RunType.DawidSkene)
            {
                goldResults.RunDawidSkene(data, data, true);
            }
            else
            {
                // Every remaining run type goes through the BCC runner.
                goldResults.RunBCC(modelName, data, data, model, RunMode.ClearResults, true, numCommunities, false, false);
            }

            return goldResults;
        }
 /// <summary>
 /// Replaces the active learning results object held by this instance.
 /// </summary>
 /// <param name="results">The results object to use from now on.</param>
 public void UpdateActiveLearningResults(Results results)
 {
     this.ActiveLearningResults = results;
 }
        /// <summary>
        /// Writes the per-round accuracy and average recall to a csv file.
        /// </summary>
        /// <remarks>
        /// The file is named <c>{resultsDir}{modelName}__graph_{suffix}_InitialLabels_{projectInitialNumLabelsPerTask}.csv</c>.
        /// <paramref name="resultsDir"/> is prepended verbatim, so it must already end with a directory separator.
        /// The numbers are formatted with the invariant culture, so the decimal separator is always a dot
        /// and can never be confused with the comma that separates the two columns.
        /// </remarks>
        /// <param name="accuracy">The list of accuracies evaluated on the gold labels at each active learning round.</param>
        /// <param name="avgRecall">The list of average recalls evaluated on the gold labels at each active learning round; must have at least as many entries as <paramref name="accuracy"/>.</param>
        /// <param name="taskValue">The list of utilities of the task selected at each active learning round (currently not written to the file).</param>
        /// <param name="results">The result instance (currently not written to the file).</param>
        /// <param name="modelName">The model name.</param>
        /// <param name="suffix">The suffix of the csv files; the value "final" is replaced by an empty suffix.</param>
        /// <param name="resultsDir">The directory to store the csv files.</param>
        /// <param name="projectInitialNumLabelsPerTask">The initial number of exploratory labels for each task.</param>
        public static void DoSnapshot(List<double> accuracy, List<double> avgRecall, List<ActiveLearningResult> taskValue, Results results, string modelName, string suffix, string resultsDir, int projectInitialNumLabelsPerTask)
        {
            suffix = suffix == "final" ? "" : suffix;
            String new_graph_csv_file_name = String.Format("{2}{0}__graph_{1}_InitialLabels_{3}.csv", modelName, suffix, resultsDir, projectInitialNumLabelsPerTask);

            using (StreamWriter writer = new StreamWriter(new_graph_csv_file_name))
            {
                var accArr = accuracy.ToArray();
                var avgRec = avgRecall.ToArray();
                writer.WriteLine("Accuracy,AvgRecall");
                for (int i = 0; i < accArr.Length; i++)
                {
                    // Edit this line to change the format in which accuracy and average recall are printed.
                    // The row is formatted explicitly with the invariant culture: the writer's default format
                    // provider follows the current culture, which may use ',' as the decimal separator and
                    // would then produce four comma-separated fields per row instead of two.
                    writer.WriteLine(String.Format(
                        System.Globalization.CultureInfo.InvariantCulture,
                        "{0:0.0000},{1:0.0000}",
                        accArr[i],
                        avgRec[i]));
                }
            }
        }
        /// <summary>
        /// Runs the standard active learning procedure on a model instance and an input data set.
        /// </summary>
        /// <remarks>
        /// Each round re-runs the selected method on the labels collected so far, assigns a value to every
        /// remaining (task, worker) label, asks <c>GetNextData</c> for the next labels and appends them to the
        /// active set. The loop ends when no further labels are returned. The accuracy and average recall logged
        /// in a round are those of the run performed before that round's labels were appended. When the loop
        /// ends, the collected lists are written out through <c>DoSnapshot</c> and <c>ResetAccuracyList</c> is called.
        /// </remarks>
        /// <param name="data">The data.</param>
        /// <param name="modelName">The model name.</param>
        /// <param name="runType">The model run type.</param>
        /// <param name="model">The model instance.</param>
        /// <param name="taskSelectionMethod">The method for selecting tasks (Entropy / Random / Uniform; any other value falls back to Entropy).</param>
        /// <param name="workerSelectionMethod">The method for selecting workers; replaced by Random whenever the results carry no worker confusion matrix.</param>
        /// <param name="resultsDir">The directory to save the log files.</param>
        /// <param name="communityCount">The number of communities (only for CBCC).</param>
        /// <param name="initialNumLabelsPerTask">The initial number of exploratory labels that are randomly selected for each task.</param>
        /// <param name="numIncremData">The number of data points to add at each round.</param>
        /// <exception cref="System.ArgumentException">The number of labels left to acquire after the initial selection is not positive.</exception>
        /// <exception cref="System.ApplicationException">The worker selection method is neither BestWorker nor RandomWorker.</exception>
        public static void RunActiveLearning(
            IList<Datum> data,
            string modelName,
            RunType runType,
            BCC model,
            TaskSelectionMethod taskSelectionMethod,
            WorkerSelectionMethod workerSelectionMethod,
            string resultsDir,
            int communityCount = -1,
            int initialNumLabelsPerTask = 1,
            int numIncremData = 1)
        {
            // Accuracy is evaluated and logged on every round.
            const bool calculateAccuracy = true;

            // Progress is printed every snapshotInterval rounds.
            const int snapshotInterval = 1;

            // Count elapsed time
            Stopwatch stopWatchTotal = Stopwatch.StartNew();

            // Dictionary keyed by task Id, with randomly ordered labelings
            var groupedRandomisedData =
                data.GroupBy(d => d.TaskId).
                Select(g =>
                {
                    var arr = g.ToArray();
                    int cnt = arr.Length;
                    var perm = Rand.Perm(cnt);
                    return new
                    {
                        key = g.Key,
                        arr = g.Select((t, i) => arr[perm[i]]).ToArray()
                    };
                }).ToDictionary(a => a.key, a => a.arr);

            // Dictionary keyed by task Id, with label counts
            Dictionary<string, int> totalCounts = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => kvp.Value.Length);

            // Keyed by task, value is a HashSet containing all the remaining workers with a label - workers are removed after adding a new datum
            Dictionary<string, HashSet<string>> remainingWorkersPerTask = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => new HashSet<string>(kvp.Value.Select(dat => dat.WorkerId)));
            int numTaskIds = totalCounts.Count;

            // Number of labels still to be acquired once the initial selection has been made.
            int totalInstances = initialNumLabelsPerTask > 0 ? data.Count - initialNumLabelsPerTask * numTaskIds : data.Count - numIncremData;

            // Nothing would be left to acquire: reject the combination of data and label counts.
            if (totalInstances <= 0)
            {
                throw new System.ArgumentException("The variable 'totalInstances' should be greater than zero");
            }

            // Only create the accuracy list when it is null (for GUI use)
            if (accuracy == null)
            {
                accuracy = new List<double>();
            }

            List<double> avgRecall = new List<double>();
            taskValueList = new List<ActiveLearningResult>();
            int index = 0;

            Console.WriteLine("Active Learning: {0}", modelName);
            Console.WriteLine("\t\t\t\t\t\tAcc\tAvgRec");

            // Get initial data
            // NOTE(review): GetSubdata is defined elsewhere; presumably it also updates remainingWorkersPerTask - confirm.
            Results results = new Results();
            Dictionary<string, int> currentCounts = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => initialNumLabelsPerTask);
            List<Datum> subData = GetSubdata(groupedRandomisedData, currentCounts, remainingWorkersPerTask);

            List<Datum> nextData = null;
            ActiveLearning activeLearning = null;
            isExperimentCompleted = false;
            for (int iter = 0; ; iter++) // run until data run out
            {
                bool doSnapShot = iter % snapshotInterval == 0;
                if (subData != null || nextData != null)
                {
                    switch (runType)
                    {
                        case RunType.VoteDistribution:
                            results.RunMajorityVote(subData, data, calculateAccuracy, true);
                            break;
                        case RunType.MajorityVote:
                            results.RunMajorityVote(subData, data, calculateAccuracy, false);
                            break;
                        case RunType.DawidSkene:
                            results.RunDawidSkene(subData, data, calculateAccuracy);
                            break;
                        default: // Run BCC models
                            results.RunBCC(modelName, subData, data, model, RunMode.ClearResults, calculateAccuracy, communityCount, false);
                            break;
                    }
                }

                // The active learning instance is created on the first round and only re-pointed at the results afterwards.
                if (activeLearning == null)
                {
                    activeLearning = new ActiveLearning(data, model, results, communityCount);
                }
                else
                {
                    activeLearning.UpdateActiveLearningResults(results);
                }

                // We create a list of task utilities
                // TaskUtility: Dictionary keyed by task, the value is an active learning result.
                Dictionary<string, ActiveLearningResult> TaskUtility = null;
                switch (taskSelectionMethod)
                {
                    case TaskSelectionMethod.EntropyTask:
                        TaskUtility = activeLearning.EntropyTrueLabel();
                        break;

                    case TaskSelectionMethod.RandomTask:
                        TaskUtility = data.GroupBy(d => d.TaskId).ToDictionary(a => a.Key, a => new ActiveLearningResult
                        {
                            TaskValue = Rand.Double()
                        });
                        break;

                    case TaskSelectionMethod.UniformTask:
                        // Reproduce uniform task selection by picking the task with the lowest number of current labels. That is, minus the current count.
                        TaskUtility = currentCounts.OrderBy(kvp => kvp.Value).ToDictionary(a => a.Key, a => new ActiveLearningResult
                        {
                            TaskId = a.Key,
                            TaskValue = -a.Value
                        });
                        break;

                    default:
                        TaskUtility = activeLearning.EntropyTrueLabel();
                        break;
                }

                // We create a list of worker utilities.
                Dictionary<string, double> WorkerAccuracy = null;

                // Best worker selection is only allowed for methods that infer worker confusion matrices.
                if (results.WorkerConfusionMatrix == null)
                {
                    workerSelectionMethod = WorkerSelectionMethod.RandomWorker;
                }

                switch (workerSelectionMethod)
                {
                    case WorkerSelectionMethod.BestWorker:
                        // Assign worker accuracies to the maximum value on the diagonal of the confusion matrix (conservative approach).
                        // Alternative ways are also possible.
                        WorkerAccuracy = results.WorkerConfusionMatrixMean.ToDictionary(
                                kvp => kvp.Key,
                                kvp => Results.GetConfusionMatrixDiagonal(kvp.Value).Max());
                        break;
                    case WorkerSelectionMethod.RandomWorker:
                        // Assign worker accuracies to random values
                        WorkerAccuracy = results.FullMapping.WorkerIdToIndex.ToDictionary(kvp => kvp.Key, kvp => Rand.Double());
                        break;
                    default:
                        throw new ApplicationException("No worker selection method selected");
                }

                // Create a list of tuples (TaskId, WorkerId, ActiveLearningResult), one per remaining (task, worker) label.
                List<Tuple<string, string, ActiveLearningResult>> LabelValue = new List<Tuple<string, string, ActiveLearningResult>>();
                foreach (var kvp in TaskUtility)
                {
                    foreach (var workerId in remainingWorkersPerTask[kvp.Key])
                    {
                        var labelValue = new ActiveLearningResult
                        {
                            WorkerId = workerId,
                            TaskId = kvp.Key,
                            TaskValue = kvp.Value.TaskValue,
                            WorkerValue = WorkerAccuracy[workerId]
                        };
                        LabelValue.Add(Tuple.Create(labelValue.TaskId, labelValue.WorkerId, labelValue));
                    }
                }

                // Increment the active set with new data
                nextData = GetNextData(groupedRandomisedData, LabelValue, currentCounts, totalCounts, remainingWorkersPerTask, numIncremData);

                // No more labels to add: the experiment is over.
                if (nextData == null || nextData.Count == 0)
                {
                    break;
                }

                index += nextData.Count;
                subData.AddRange(nextData);

                // Logs
                if (calculateAccuracy)
                {
                    accuracy.Add(results.Accuracy);
                    avgRecall.Add(results.AvgRecall);

                    // Log the utility entry of the task of the first datum added this round, tagged with that
                    // datum's worker and task ids. TaskUtility cannot be null here: it has already been
                    // enumerated above while building LabelValue.
                    var firstAdded = nextData.First();
                    ActiveLearningResult nextTaskValueItem = TaskUtility[firstAdded.TaskId];
                    nextTaskValueItem.WorkerId = firstAdded.WorkerId;
                    nextTaskValueItem.TaskId = firstAdded.TaskId;
                    taskValueList.Add(nextTaskValueItem);

                    if (doSnapShot)
                    {
                        Console.WriteLine("{0} (label {1} of {2}):\t{3:0.000}\t{4:0.0000}", modelName, index, totalInstances, accuracy.Last(), avgRecall.Last());
                    }
                } // end if logs
            } // end for all data

            isExperimentCompleted = true;

            stopWatchTotal.Stop();
            DoSnapshot(accuracy, avgRecall, taskValueList, results, modelName, "final", resultsDir, initialNumLabelsPerTask);
            ResetAccuracyList();
            Console.WriteLine("Elapsed time: {0}\n", stopWatchTotal.Elapsed);
        }