/// <summary> /// computes the size of the set of entries that have the actual reviewer within the top 5 computed reviewers and is shared between all algorithms /// </summary> public void FindIntersectingEntriesForActualReviewerIds(AbstractSourceOfBugs source) { throw new NotImplementedException("This must be reimplemented to reflect the changes to StatisticsResult"); //IEnumerable<int> actualReviewerIds = source.findBugs(); //List<int> algorithmIds; //using (var context = new ExpertiseDBEntities()) // algorithmIds = context.Algorithms.Select(a => a.AlgorithmId).ToList(); //log.Debug("Setting up"); //List<List<StatisticsResult>> allStatistics = algorithmIds // // read statistics for every algorithm // .Select(algorithmId => File.ReadAllLines(string.Format(basepath + "stats_{0}.txt", algorithmId))) // // map the string[] with the statistics of each algorithm to a List<StatisticResult> for each algorithm // .Select(originalData => originalData.Select(StatisticsResult.FromCSVLine).ToList()).ToList(); //List<StatisticsResult> workingSet = allStatistics[1] // Why 1??? Maybe skip Line 10 rule, because it has at most one entry? // .Where(statResult => actualReviewerIds.Contains(statResult.ActualReviewerId) && statResult.AuthorWasFound).ToList(); //int count = workingSet.Count; //log.Info("Setup complete"); //for (int i = 2; i < algorithmIds.Count; i++) //{ // log.InfoFormat("Now testing against {0}, working set count: {1}", i, workingSet.Count); // workingSet = workingSet.Where(s => allStatistics[i].Any(stats => stats.ActualReviewerId == s.ActualReviewerId && stats.AuthorWasFound)).ToList(); //} //string sb = string.Format("{0} / {1} intersecting entries", workingSet.Count, count); //File.WriteAllText(string.Format(basepath + "stats{0}.txt", source.Postfix), sb); }
private void ComputeStatisticsForAlgorithmAndActualReviews(int algorithmId, AbstractSourceOfBugs source) { string[] originalData = File.ReadAllLines($"{basepath}stats_{algorithmId}.txt"); List <StatisticsResult> workingSet = originalData.Select(StatisticsResult.FromCSVLine) .Where(sr => source.findBugs().Contains(sr.BugId)).ToList(); int count = workingSet.Count(); StringBuilder batchSb = new StringBuilder(); StringBuilder batchDerivationSb = new StringBuilder(); for (int i = 0; i < count; i += BATCHSIZE) { ComputeBatchStatistic(batchSb, workingSet.Take(i).ToList()); int from = i; if (from < 0) { from = 0; } int take = BATCHSIZE; if (from + take > count) { take = count - from; } ComputeBatchDerivationStatistic(batchDerivationSb, workingSet.Skip(from).Take(take).ToList()); } ComputeBatchStatistic(batchSb, workingSet.Take(count).ToList()); File.WriteAllText($"{basepath}stats_{algorithmId}_analyzed{source.Postfix}-batch.csv", batchSb.ToString()); File.WriteAllText($"{basepath}stats_{algorithmId}_analyzed{source.Postfix}-deriv-batch.csv", batchDerivationSb.ToString()); int foundNo = workingSet.Count(sr => sr.IsMatch); int[] expertPlacements = new int[StatisticsResult.NUMBER_OF_EXPERTS]; for (int i = 0; i < StatisticsResult.NUMBER_OF_EXPERTS; i++) { expertPlacements[i] = workingSet.Count(sr => sr.Matches[i]); } StringBuilder sb = new StringBuilder(); sb.AppendLine($"Expert was found: {foundNo} / {count} ({(double)foundNo / count:P})"); for (int i = 0; i < StatisticsResult.NUMBER_OF_EXPERTS; i++) { sb.AppendLine($"Expert was No {i+1}: {expertPlacements[i]} / {count} ({(double)expertPlacements[i] / (double)count:P})"); } int top1 = expertPlacements[0]; int top3 = top1 + expertPlacements[1] + expertPlacements[2]; int top5 = top3 + expertPlacements[3] + expertPlacements[4]; sb.AppendLine().AppendLine($"Top-1: {(double)top1 / (double)count:P}"); sb.AppendLine($"Top-3: {(double)top3 / (double)count:P}"); sb.AppendLine($"Top-5: {(double)top5 / (double)count:P}"); File.WriteAllText($"{basepath}stats_{algorithmId}_analyzed{source.Postfix}.txt", sb.ToString()); }
/// <summary> /// Expects stats_x.txt entries for all algorithms. It then counts the hits and misses and computes /// the fraction of hits. Results are written to stats_x_analyzedPOSTFIX.txt. /// </summary> public void ComputeStatisticsForAllAlgorithmsAndActualReviews(AbstractSourceOfBugs source) { List <int> algorithmIds; using (var context = new ExpertiseDBEntities()) algorithmIds = context.Algorithms .Select(a => a.AlgorithmId).ToList() // perform the SQL query .Where(algoID => File.Exists($"{basepath}stats_{algoID}.txt")).ToList(); // check whether the file exists on disk Debug.Assert(algorithmIds.Any()); Parallel.ForEach(algorithmIds, algorithmId => ComputeStatisticsForAlgorithmAndActualReviews(algorithmId, source)); }
/// <summary> /// computes the size of the set of entries that have the actual reviewer within the top 5 computed reviewers and is shared between two algorithms by pairwise comparison /// </summary> public void FindIntersectingEntriesPairwiseForActualReviewerIds(AbstractSourceOfBugs source) { throw new NotImplementedException("This must be reimplemented to reflect the changes to StatisticsResult"); //IEnumerable<int> actualReviewerIds = source.findBugs(); //List<int> algorithmIds; //using (var context = new ExpertiseDBEntities()) // algorithmIds = context.Algorithms.Select(a => a.AlgorithmId).ToList(); //log.Debug("Setting up"); //var sb = new StringBuilder(); //List<List<StatisticsResult>> allStatistics = algorithmIds // .Select(algorithmId => File.ReadAllLines(string.Format(basepath + "stats_{0}.txt", algorithmId))) // .Select(originalData => originalData.Select(StatisticsResult.FromCSVLine) // .ToList()) // .ToList(); //allStatistics[0] = allStatistics[0].Where(stat => stat.AuthorWasExpertNo == 1).ToList(); //log.Info("Setup complete"); //for (int i = 0; i < algorithmIds.Count; i++) //{ // List<StatisticsResult> workingSet = allStatistics[i].Where(tmp => actualReviewerIds.Contains(tmp.ActualReviewerId) && tmp.AuthorWasFound).ToList(); // int count = workingSet.Count; // for (int j = 0; j < algorithmIds.Count; j++) // { // if (j == i) // continue; // List<StatisticsResult> result = workingSet.Where(s => allStatistics[j].Any(stats => stats.ActualReviewerId == s.ActualReviewerId && stats.AuthorWasFound)).ToList(); // sb.AppendLine(string.Format("{0} / {1} ({2:P}) intersecting entries for A{3} and A{4}", result.Count, count, (double)result.Count / (double)count, i + 1, j + 1)); // } //} //File.WriteAllText(string.Format(basepath + "stats_intersect_pairwise{0}.txt", source.Postfix), sb.ToString()); }
/// <summary> /// Iterates through the list of bugs and checks for each bug which algorithms suggested correct reviewers and which did not. /// Results are written to stats_x.txt: A CSV with bugId and correctly predicted reviewers in each entry (StatisticsResult strings) /// </summary> public void AnalyzeActualReviews(AbstractSourceOfBugs sourceOfActualReviewers) { IEnumerable <int> allBugIds = sourceOfActualReviewers.findBugs(); int someRandomBugId = allBugIds.First(); List <int> algorithmIds; using (ExpertiseDBEntities context = new ExpertiseDBEntities()) algorithmIds = context.Algorithms .Select(a => a.AlgorithmId) .Where(algoId => context.ComputedReviewers.Any(cr => cr.BugId == someRandomBugId && cr.AlgorithmId == algoId)) // filter algorithms for which no calculation has been done .ToList(); int count = 0; int errorCount = 0; double elapsed = 0d; int maxCount = allBugIds.Count(); Stopwatch sw = new Stopwatch(); sw.Start(); IDictionary <int, List <StatisticsResult> > output = algorithmIds.ToDictionary(algorithmId => algorithmId, algorithmId => new List <StatisticsResult>()); foreach (int bugId in allBugIds) { if (++count % 1000 == 0 && count > 0) { sw.Stop(); elapsed += sw.Elapsed.TotalSeconds; double avg = elapsed / count; TimeSpan remaining = TimeSpan.FromSeconds(avg * (maxCount - count)); log.DebugFormat("Now at: {0} - (act: {1} | avg: {2:N}s | remaining: {3})", count, sw.Elapsed, avg, remaining); sw.Restart(); } try { List <int> actualReviewerIds; using (ExpertiseDBEntities context = new ExpertiseDBEntities()) actualReviewerIds = context.ActualReviewers .Where(ar => ar.BugId == bugId) .Select(ar => context.Developers.FirstOrDefault(dev => dev.Name == ar.Reviewer && dev.RepositoryId == RepositoryId).DeveloperId) .ToList(); Debug.Assert(actualReviewerIds.Count > 0); // All bugs must have reviewers foreach (int algorithmId in algorithmIds) { output[algorithmId].Add(CalculateResultForOneAlgorithmAndBug(algorithmId, bugId, actualReviewerIds)); } } catch (Exception ex) { if (++errorCount > 10) { log.Fatal("10 errors while computing statistics, BugID=" + bugId + ", giving up.", ex); throw new Exception("10 errors while computing statistics, giving up.", ex); } log.Error("Error #" + errorCount + " on BugID " + bugId, ex); } } foreach (int algorithmId in output.Keys) { List <StatisticsResult> algoStats = output[algorithmId]; var sb = new StringBuilder(); foreach (StatisticsResult statisticsResult in algoStats) { sb.AppendLine(statisticsResult.ToCSV()); } File.WriteAllText(string.Format(basepath + "stats_{0}.txt", algorithmId), sb.ToString()); } }