/// <summary>
/// Adds a newly reported repository to the current index and notifies the
/// index service. Private repositories are ignored, and nothing happens when
/// no index is currently available.
/// </summary>
/// <param name="repository">The repository payload from the GitHub event.</param>
private void AddRepo(GitHubEventRepository repository)
{
    // Private repositories are never added to the index.
    if (repository.Private)
    {
        return;
    }

    // Without an index there is nothing to add the repository to.
    if (_indexService.Index is not { } currentIndex)
    {
        return;
    }

    var addedRepo = new CrawledRepo
    {
        Id = repository.Id,
        Size = repository.Size
    };

    // Fills in the remaining fields (org, name, archived flag).
    UpdateRepo(repository, addedRepo);

    // The repo collection is replaced with the result of CopyAndAdd rather than mutated in place.
    currentIndex.Repos = currentIndex.Repos.CopyAndAdd(addedRepo);
    _indexService.NotifyIndexChanged();
}
/// <summary>
/// Builds a <see cref="CrawledIssue"/> for <paramref name="repo"/> from a GitHub issue.
/// </summary>
/// <param name="repo">The crawled repository the resulting issue belongs to.</param>
/// <param name="issue">The GitHub issue to convert.</param>
/// <param name="labels">Crawled labels keyed by ID; passed to <c>GetLabels</c> to resolve the issue's labels.</param>
/// <param name="milestones">Crawled milestones keyed by ID; passed to <c>GetMilestone</c> to resolve the issue's milestone.</param>
/// <returns>The newly created crawled issue.</returns>
private static CrawledIssue ConvertIssue(CrawledRepo repo, Issue issue, Dictionary<long, CrawledLabel> labels, Dictionary<long, CrawledMilestone> milestones)
{
    // Reactions may be absent; every count falls back to zero in that case.
    var reactions = issue.Reactions;

    return new CrawledIssue
    {
        Id = issue.Id,
        Repo = repo,
        Number = issue.Number,
        IsOpen = ConvertIssueState(issue.State),
        Title = issue.Title,
        Body = issue.Body,
        CreatedAt = issue.CreatedAt,
        UpdatedAt = issue.UpdatedAt,
        ClosedAt = issue.ClosedAt,
        CreatedBy = issue.User.Login,
        Assignees = ConvertUsers(issue.Assignees),
        Labels = GetLabels(issue.Labels, labels),
        Milestone = GetMilestone(issue.Milestone, milestones),
        IsLocked = issue.Locked,
        Comments = issue.Comments,
        ReactionsPlus1 = reactions?.Plus1 ?? 0,
        ReactionsMinus1 = reactions?.Minus1 ?? 0,
        ReactionsSmile = reactions?.Laugh ?? 0,
        ReactionsTada = reactions?.Hooray ?? 0,
        ReactionsThinkingFace = reactions?.Confused ?? 0,
        ReactionsHeart = reactions?.Heart ?? 0
        // TODO: RocketShip and Eyes are missing
    };
}
/// <summary>
/// Removes an issue from its repository and drops its terms from the trie.
/// </summary>
/// <param name="crawledRepo">The repository that currently holds the issue.</param>
/// <param name="crawledIssue">The issue to remove.</param>
private void RemoveIssue(CrawledRepo crawledRepo, CrawledIssue crawledIssue)
{
    // The terms are captured before the issue is taken out of the repo.
    // NOTE(review): presumably GetTrieTerms could depend on the issue still being attached; order kept as-is.
    var staleTerms = crawledIssue.GetTrieTerms();

    crawledRepo.Issues.Remove(crawledIssue.Number);

    RemoveTrieTerms(crawledIssue, staleTerms);
}
/// <summary>
/// Brings the labels of <paramref name="crawledRepo"/> in line with the labels
/// currently reported by GitHub: existing labels are updated, new ones are
/// created, and labels GitHub no longer reports are removed from the repo and
/// from every issue that carries them.
/// </summary>
/// <param name="crawledRepo">The repository whose labels are synchronized.</param>
/// <param name="gitHubLabels">The labels as currently reported by GitHub.</param>
/// <param name="labelById">
/// Receives the crawled labels keyed by ID. Labels deleted during this call are
/// not removed from this dictionary.
/// </param>
private static void SyncLabels(CrawledRepo crawledRepo, IReadOnlyList<Label> gitHubLabels, out Dictionary<long, CrawledLabel> labelById)
{
    // TODO: This logic feels similar to what we do in the web site. Should we reconcile this?

    var knownById = crawledRepo.Labels.ToDictionary(l => l.Id);
    var remoteById = gitHubLabels.ToDictionary(l => l.Id);

    foreach (var remote in gitHubLabels)
    {
        if (!knownById.TryGetValue(remote.Id, out var known))
        {
            // Create
            known = ConvertLabel(remote);
            crawledRepo.Labels.Add(known);
            knownById.Add(known.Id, known);
            continue;
        }

        // Update
        known.Name = remote.Name;
        known.Description = remote.Description;
        known.ColorText = remote.Color;
    }

    // Delete
    //
    // The candidates are materialized first because the repo's label
    // collection is modified while we walk them.

    var obsoleteLabels = crawledRepo.Labels
                                    .Where(l => !remoteById.ContainsKey(l.Id))
                                    .ToArray();

    foreach (var obsolete in obsoleteLabels)
    {
        crawledRepo.Labels.Remove(obsolete);

        foreach (var issue in crawledRepo.Issues.Values)
        {
            if (!issue.Labels.Contains(obsolete))
            {
                continue;
            }

            var remaining = issue.Labels.ToList();
            remaining.Remove(obsolete);
            issue.Labels = remaining.ToArray();
        }
    }

    // Fix labels
    //
    // Re-point every issue label at the instance registered under the same ID.

    foreach (var issue in crawledRepo.Issues.Values)
    {
        for (var index = 0; index < issue.Labels.Length; index++)
        {
            issue.Labels[index] = knownById[issue.Labels[index].Id];
        }
    }

    labelById = knownById;
}
/// <summary>
/// Brings the milestones of <paramref name="crawledRepo"/> in line with the
/// milestones currently reported by GitHub: existing milestones are updated,
/// new ones are created, and milestones GitHub no longer reports are removed
/// from the repo and cleared from every issue that references them.
/// </summary>
/// <param name="crawledRepo">The repository whose milestones are synchronized.</param>
/// <param name="gitHubMilestones">The milestones as currently reported by GitHub.</param>
/// <param name="milestoneById">
/// Receives the crawled milestones keyed by ID. Milestones deleted during this
/// call are not removed from this dictionary.
/// </param>
private static void SyncMilestones(CrawledRepo crawledRepo, IReadOnlyList<Milestone> gitHubMilestones, out Dictionary<long, CrawledMilestone> milestoneById)
{
    // TODO: This logic feels similar to what we do in the web site. Should we reconcile this?

    var knownById = crawledRepo.Milestones.ToDictionary(l => l.Id);
    var remoteById = gitHubMilestones.ToDictionary(l => l.Id);

    foreach (var remote in gitHubMilestones)
    {
        if (!knownById.TryGetValue(remote.Id, out var known))
        {
            // Create
            known = ConvertMilestone(remote);
            crawledRepo.Milestones.Add(known);
            knownById.Add(known.Id, known);
            continue;
        }

        // Update
        known.Title = remote.Title;
        known.Description = remote.Description;
        known.Number = remote.Number;
    }

    // Delete
    //
    // The candidates are materialized first because the repo's milestone
    // collection is modified while we walk them.

    var obsoleteMilestones = crawledRepo.Milestones
                                        .Where(l => !remoteById.ContainsKey(l.Id))
                                        .ToArray();

    foreach (var obsolete in obsoleteMilestones)
    {
        crawledRepo.Milestones.Remove(obsolete);

        foreach (var issue in crawledRepo.Issues.Values)
        {
            if (issue.Milestone == obsolete)
            {
                issue.Milestone = null;
            }
        }
    }

    // Fix milestones
    //
    // Re-point every issue milestone at the instance registered under the same ID.

    foreach (var issue in crawledRepo.Issues.Values)
    {
        if (issue.Milestone is null)
        {
            continue;
        }

        issue.Milestone = knownById[issue.Milestone.Id];
    }

    milestoneById = knownById;
}
/// <summary>
/// Creates a crawled label from a GitHub event label and registers it with the repository.
/// </summary>
/// <param name="crawledRepo">The repository that receives the new label.</param>
/// <param name="label">The label payload from the GitHub event.</param>
/// <returns>The newly created label.</returns>
private static CrawledLabel CreateLabel(CrawledRepo crawledRepo, GitHubEventLabel label)
{
    var created = new CrawledLabel { Id = label.Id };

    // Copies the remaining fields from the event payload.
    UpdateLabel(label, created);

    // The label collection is replaced with the result of CopyAndAdd rather than mutated in place.
    crawledRepo.Labels = crawledRepo.Labels.CopyAndAdd(created);

    return created;
}
/// <summary>
/// Creates a crawled milestone from a GitHub event milestone and registers it with the repository.
/// </summary>
/// <param name="crawledRepo">The repository that receives the new milestone.</param>
/// <param name="milestone">The milestone payload from the GitHub event.</param>
/// <returns>The newly created milestone.</returns>
private static CrawledMilestone CreateMilestone(CrawledRepo crawledRepo, GitHubEventMilestone milestone)
{
    var created = new CrawledMilestone
    {
        Id = milestone.Id,
        Number = milestone.Number
    };

    // Copies the remaining fields from the event payload.
    UpdateMilestone(milestone, created);

    // The milestone collection is replaced with the result of CopyAndAdd rather than mutated in place.
    crawledRepo.Milestones = crawledRepo.Milestones.CopyAndAdd(created);

    return created;
}
/// <summary>
/// Runs one crawl: restores cached repos from Azure blob storage, optionally
/// pulls the latest state from GitHub, builds the search trie and index,
/// writes the index to disk and optionally uploads everything back to Azure.
/// </summary>
/// <param name="subscriptionList">The orgs and repos that should be crawled.</param>
/// <param name="reindex">
/// When <c>true</c> and <paramref name="startingRepoName"/> is <c>null</c>, no cached
/// repos are downloaded and every repo is treated as having reached the starting repo,
/// i.e. it is cleared and fully reindexed (only relevant when <paramref name="pullLatest"/> is set).
/// </param>
/// <param name="pullLatest">
/// When <c>true</c>, repos, labels, milestones, issues and pull requests are requested
/// from GitHub; when <c>false</c>, only the cached repos are loaded.
/// </param>
/// <param name="randomReindex">
/// When <c>true</c>, additional repos are randomly picked for a full reindex so that at
/// least <c>ceil(#repos / 28)</c> repos are reindexed per run.
/// </param>
/// <param name="uploadToAzure">
/// When <c>true</c>, repo caches and the index are uploaded, blobs of repos that are no
/// longer available are deleted, and stored events of fully reindexed repos are deleted.
/// </param>
/// <param name="startingRepoName">
/// Optional <c>org/repo</c> name. Cached blobs are downloaded only until this repo's blob
/// is listed; this repo and all repos processed after it are cleared and reindexed.
/// </param>
/// <param name="outputPath">
/// Path the index is written to. When null or empty the index is written into the temp
/// directory, which is deleted at the end of the run.
/// </param>
private static async Task RunAsync(CrawledSubscriptionList subscriptionList, bool reindex, bool pullLatest, bool randomReindex, bool uploadToAzure, string startingRepoName, string outputPath)
{
    var reindexIntervalInDays = 28;
    var today = DateTime.Today;

    var connectionString = GetAzureStorageConnectionString();

    // TODO: We should avoid having to use a temp directory

    // Always start from an empty temp directory so stale files from a previous run can't leak in.
    var tempDirectory = Path.Combine(Path.GetTempPath(), "ghcrawler");
    if (Directory.Exists(tempDirectory))
    {
        Directory.Delete(tempDirectory, recursive: true);
    }

    Directory.CreateDirectory(tempDirectory);

    var cacheContainerName = "cache";
    var cacheContainerClient = new BlobContainerClient(connectionString, cacheContainerName);

    if (!reindex || startingRepoName is not null)
    {
        var startingBlobName = $"{startingRepoName}.crcache";
        var reachedStartingBlob = false;

        await foreach (var blob in cacheContainerClient.GetBlobsAsync())
        {
            if (!subscriptionList.Contains(blob.Name.Replace(".crcache", "")))
            {
                continue;
            }

            if (blob.Name == startingBlobName)
            {
                reachedStartingBlob = true;
            }

            // The starting blob and every blob listed after it are not downloaded.
            if (reachedStartingBlob)
            {
                continue;
            }

            Console.WriteLine($"Downloading {blob.Name}...");

            var localPath = Path.Combine(tempDirectory, blob.Name);
            var localDirectory = Path.GetDirectoryName(localPath);
            Directory.CreateDirectory(localDirectory);

            var blobClient = new BlobClient(connectionString, cacheContainerName, blob.Name);
            await blobClient.DownloadToAsync(localPath);
        }
    }

    var factory = CreateGitHubClientFactory();
    var client = await factory.CreateAsync();

    var repos = new List<CrawledRepo>();

    var reachedStartingRepo = reindex && startingRepoName is null;

    foreach (var org in subscriptionList.Orgs)
    {
        var orgDirectory = Path.Join(tempDirectory, org);
        Directory.CreateDirectory(orgDirectory);

        var existingRepos = Directory.GetFiles(orgDirectory, "*.crcache")
                                     .Select(p => Path.GetFileNameWithoutExtension(p));

        if (!pullLatest)
        {
            Console.WriteLine($"Loading repos for {org}...");

            foreach (var repoName in existingRepos)
            {
                var blobName = $"{repoName}.crcache";
                var repoPath = Path.Join(orgDirectory, blobName);
                var repo = await CrawledRepo.LoadAsync(repoPath);
                if (repo is not null)
                {
                    repos.Add(repo);
                }
            }
        }
        else
        {
            Console.WriteLine($"Requesting repos for {org}...");
            var availableRepos = await RequestReposAsync(factory, client, org);

            // Anything we have cached locally that GitHub no longer reports is considered deleted.
            var deletedRepos = existingRepos.ToHashSet(StringComparer.OrdinalIgnoreCase);
            deletedRepos.ExceptWith(availableRepos.Select(r => r.Name));

            foreach (var deletedRepo in deletedRepos)
            {
                var blobName = $"{org}/{deletedRepo}.crcache";
                var repoPath = Path.Join(tempDirectory, blobName);

                Console.WriteLine($"Deleting local file {blobName}...");
                File.Delete(repoPath);

                if (uploadToAzure)
                {
                    Console.WriteLine($"Deleting Azure blob {blobName}...");
                    await cacheContainerClient.DeleteBlobAsync(blobName);
                }
            }

            foreach (var repo in availableRepos)
            {
                if (!subscriptionList.Contains(org, repo.Name))
                {
                    continue;
                }

                var blobName = $"{org}/{repo.Name}.crcache";
                var repoPath = Path.Join(tempDirectory, blobName);

                // Once set, this stays set for all remaining repos of all remaining orgs.
                if (string.Equals($"{org}/{repo.Name}", startingRepoName, StringComparison.OrdinalIgnoreCase))
                {
                    reachedStartingRepo = true;
                }

                CrawledRepo crawledRepo;
                try
                {
                    crawledRepo = await CrawledRepo.LoadAsync(repoPath);
                }
                catch (JsonException)
                {
                    // An unreadable cache is treated like a missing one: the repo is crawled from scratch.
                    Console.WriteLine($"WARNING: Couldn't parse {blobName}");
                    crawledRepo = null;
                }

                if (crawledRepo is null)
                {
                    crawledRepo = new CrawledRepo
                    {
                        Id = repo.Id,
                        Org = org,
                        Name = repo.Name
                    };
                }

                crawledRepo.IsArchived = repo.Archived;
                crawledRepo.Size = repo.Size;

                repos.Add(crawledRepo);

                var repoIsDueForReindexing = crawledRepo.LastReindex is null ||
                                             crawledRepo.LastReindex?.AddDays(reindexIntervalInDays) <= today;

                if (reachedStartingRepo)
                {
                    Console.WriteLine($"Marking {repo.FullName} to be re-indexed because we reached the starting repo {startingRepoName}.");
                }

                if (repoIsDueForReindexing)
                {
                    if (crawledRepo.LastReindex is null)
                    {
                        Console.WriteLine($"Marking {repo.FullName} to be re-indexed because it was never fully indexed.");
                    }
                    else
                    {
                        Console.WriteLine($"Marking {repo.FullName} to be re-indexed because it was more than {reindexIntervalInDays} days ago, on {crawledRepo.LastReindex}.");
                    }
                }

                if (reachedStartingRepo || repoIsDueForReindexing)
                {
                    crawledRepo.Clear();
                }
            }
        }
    }

    // We want to ensure that all repos are fully-reindexed at least once every four weeks.
    // That means we need to reindex at least #Repos / 28 per day.
    //
    // On top of that, we need to ensure that all repos which were never fully indexed (e.g.
    // they are new or were forced to be reindexed) are also reindexed.

    if (randomReindex)
    {
        var reposThatNeedReindexing = repos.Where(r => r.LastReindex is null).ToHashSet();

        var minimumNumberOfReposToBeReindexed = (int)Math.Ceiling(repos.Count / (float)reindexIntervalInDays);
        var numberOfReposThatNeedReindexing = reposThatNeedReindexing.Count;

        if (numberOfReposThatNeedReindexing < minimumNumberOfReposToBeReindexed)
        {
            // OK, there are fewer repos that need reindexing than what we want to reindex
            // per day. So let's randomly pick some repos to reindex.

            var remainingRepos = repos.Except(reposThatNeedReindexing).ToList();
            var choiceCount = minimumNumberOfReposToBeReindexed - numberOfReposThatNeedReindexing;

            var random = new Random();

            for (var choice = 0; choice < choiceCount; choice++)
            {
                var i = random.Next(0, remainingRepos.Count);
                var repo = remainingRepos[i];

                Console.WriteLine($"Marking {repo.FullName} to be re-indexed because it was randomly chosen.");

                repo.Clear();
                reposThatNeedReindexing.Add(repo);
                remainingRepos.RemoveAt(i);
            }
        }
    }

    if (pullLatest)
    {
        Console.WriteLine($"Listing events...");

        var eventStore = new GitHubEventStore(connectionString);
        var events = await eventStore.ListAsync();

        Console.WriteLine($"Crawling {repos.Count:N0} repos, fully reindexing {repos.Count(r => r.LastReindex is null):N0} repos...");

        foreach (var crawledRepo in repos)
        {
            var blobName = $"{crawledRepo.FullName}.crcache";
            var repoPath = Path.Join(tempDirectory, blobName);
            var since = crawledRepo.IncrementalUpdateStart;

            var messages = new List<GitHubEventMessage>();

            if (since is null)
            {
                Console.WriteLine($"Crawling {crawledRepo.FullName}...");
            }
            else
            {
                // Stored events are only loaded for incremental updates.
                var toBeDownloaded = events.Where(n => string.Equals(n.Org, crawledRepo.Org, StringComparison.OrdinalIgnoreCase) &&
                                                       string.Equals(n.Repo, crawledRepo.Name, StringComparison.OrdinalIgnoreCase))
                                           .ToArray();

                if (toBeDownloaded.Any())
                {
                    Console.WriteLine($"Loading {toBeDownloaded.Length:N0} events for {crawledRepo.FullName}...");

                    var i = 0;
                    var lastPercent = 0;

                    foreach (var name in toBeDownloaded)
                    {
                        // Progress is only printed when it reaches a new multiple of 10%.
                        var percent = (int)Math.Ceiling((float)i / toBeDownloaded.Length * 100);
                        i++;
                        if (percent % 10 == 0)
                        {
                            if (percent != lastPercent)
                            {
                                Console.Write($"{percent}%...");
                            }

                            lastPercent = percent;
                        }

                        var payload = await eventStore.LoadAsync(name);
                        var headers = payload.Headers.ToDictionary(kv => kv.Key, kv => new StringValues(kv.Value.ToArray()));
                        var body = payload.Body;
                        var message = GitHubEventMessage.Parse(headers, body);
                        messages.Add(message);
                    }

                    Console.WriteLine("done.");
                }

                Console.WriteLine($"Crawling {crawledRepo.FullName} since {since}...");
            }

            if (crawledRepo.LastReindex is null)
            {
                crawledRepo.LastReindex = DateTimeOffset.UtcNow;
            }

            crawledRepo.AreaOwners = await GetAreaOwnersAsync(crawledRepo.Org, crawledRepo.Name);

            var currentLabels = await RequestLabelsAsync(factory, client, crawledRepo.Org, crawledRepo.Name);

            SyncLabels(crawledRepo, currentLabels, out var labelById);

            var currentMilestones = await RequestMilestonesAsync(factory, client, crawledRepo.Org, crawledRepo.Name);

            SyncMilestones(crawledRepo, currentMilestones, out var milestoneById);

            // NOTE: GitHub's Issues.GetAllForRepository() doesn't include issues that were transferred
            //
            // That's the good part. The bad part is that for the new repository where
            // it shows up, we have no way of knowing which repo it came from and which
            // number it used to have (even when looking at the issues timeline data),
            // so we can't remove the issue from the source repo.
            //
            // However, since we're persisting GitHub events we received, we can look
            // up which issues were transferred and remove them from the repo. This avoids
            // having to wait until we fully reindex the repo.
            //
            // Note, we remove transferred issues before pulling issues in case the issues
            // were being transferred back; it seems GitHub is reusing the numbers in that
            // case.

            // The filter is null-safe, consistent with the null-conditional access
            // used on the message body inside the loop.
            foreach (var message in messages.Where(m => m.Body?.Action == "transferred"))
            {
                Console.WriteLine($"Removing {message.Body?.Repository?.FullName}#{message.Body?.Issue?.Number}: {message.Body?.Issue?.Title}");

                var number = message.Body?.Issue?.Number;
                if (number is not null)
                {
                    crawledRepo.Issues.Remove(number.Value);
                }
            }

            foreach (var issue in await RequestIssuesAsync(factory, client, crawledRepo.Org, crawledRepo.Name, since))
            {
                var crawledIssue = ConvertIssue(crawledRepo, issue, labelById, milestoneById);
                crawledRepo.Issues[issue.Number] = crawledIssue;
            }

            foreach (var pullRequest in await RequestPullRequestsAsync(factory, client, crawledRepo.Org, crawledRepo.Name, since))
            {
                if (crawledRepo.Issues.TryGetValue(pullRequest.Number, out var issue))
                {
                    UpdateIssue(issue, pullRequest);
                }

                // TODO: Get PR reviews
                // TODO: Get PR commits
                // TODO: Get PR status
            }

            await crawledRepo.SaveAsync(repoPath);

            if (uploadToAzure)
            {
                Console.WriteLine($"Uploading {blobName} to Azure...");

                var repoClient = new BlobClient(connectionString, cacheContainerName, blobName);
                await repoClient.UploadAsync(repoPath, overwrite: true);

                // Stored events are only deleted after a full crawl of the repo (no incremental start).
                if (since is null)
                {
                    var eventsToBeDeleted = events.Where(e => string.Equals($"{e.Org}/{e.Repo}", crawledRepo.FullName, StringComparison.OrdinalIgnoreCase))
                                                  .ToArray();

                    Console.WriteLine($"Deleting {eventsToBeDeleted.Length:N0} events for {crawledRepo.FullName}...");
                    foreach (var e in eventsToBeDeleted)
                    {
                        await eventStore.DeleteAsync(e);
                    }
                }
            }
        }
    }

    // Consistency check: every label and milestone referenced by an issue must be
    // one of the instances registered with its repo. Violations are only reported.
    foreach (var repo in repos)
    {
        var milestones = repo.Milestones.ToHashSet();
        var labels = repo.Labels.ToHashSet();

        foreach (var issue in repo.Issues.Values)
        {
            foreach (var label in issue.Labels.Where(l => !labels.Contains(l)))
            {
                Console.Error.WriteLine($"error: {repo.FullName}#{issue.Number}: label '{label.Name}' doesn't exist");
            }

            if (issue.Milestone is not null && !milestones.Contains(issue.Milestone))
            {
                Console.Error.WriteLine($"error: {repo.FullName}#{issue.Number}: milestone '{issue.Milestone.Title}' doesn't exist");
            }
        }
    }

    Console.WriteLine("Creating trie...");

    var trie = new CrawledTrie<CrawledIssue>();

    foreach (var repo in repos)
    {
        foreach (var issue in repo.Issues.Values)
        {
            trie.Add(issue);
        }
    }

    Console.WriteLine("Creating index...");

    var index = new CrawledIndex()
    {
        Repos = repos.ToList(),
        Trie = trie
    };

    var indexName = "index.cicache";
    var indexPath = string.IsNullOrEmpty(outputPath)
                        ? Path.Join(tempDirectory, indexName)
                        : outputPath;

    await index.SaveAsync(indexPath);

    if (uploadToAzure)
    {
        Console.WriteLine("Uploading index to Azure...");

        var indexClient = new BlobClient(connectionString, "index", indexName);
        await indexClient.UploadAsync(indexPath, overwrite: true);
    }

    Console.WriteLine("Deleting temp files...");

    Directory.Delete(tempDirectory, recursive: true);
}
/// <summary>
/// Copies the org, name and archived flag from a GitHub event repository onto a crawled repository.
/// </summary>
/// <param name="repository">The repository payload from the GitHub event.</param>
/// <param name="crawledRepo">The crawled repository to update.</param>
private static void UpdateRepo(GitHubEventRepository repository, CrawledRepo crawledRepo)
{
    // The org is taken from the login of the repository's owner.
    var ownerLogin = repository.Owner.Login;

    crawledRepo.Org = ownerLogin;
    crawledRepo.Name = repository.Name;
    crawledRepo.IsArchived = repository.Archived;
}