// Replays all persisted GitHub webhook events into the processing queue.
private async Task LoadEventsAsync()
{
    try
    {
        foreach (var name in await _store.ListAsync())
        {
            var payload = await _store.LoadAsync(name);
            var headers = payload.Headers.ToDictionary(kv => kv.Key,
                                                       kv => new StringValues(kv.Value.ToArray()));
            var body = payload.Body;
            var message = GitHubEventMessage.Parse(headers, body);
            _processingService.Enqueue(message);
        }
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Couldn't load stored events");
    }
}
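// Performs one full crawl cycle:
//   1. Download cached per-repo state from the "cache" blob container (skipped
//      when a full reindex was requested without a starting repo).
//   2. Refresh the repo list per org, deleting caches for repos that no longer exist.
//   3. Mark repos for reindexing: stale, never fully indexed, or randomly chosen
//      so that every repo is fully reindexed at least once per interval.
//   4. Crawl each repo, replaying stored webhook events for incremental updates.
//   5. Validate labels and milestones, build the search trie, and write the index.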
private static async Task RunAsync(CrawledSubscriptionList subscriptionList,
                                   bool reindex,
                                   bool pullLatest,
                                   bool randomReindex,
                                   bool uploadToAzure,
                                   string startingRepoName,
                                   string outputPath)
{
    var reindexIntervalInDays = 28;
    var today = DateTime.Today;
    var connectionString = GetAzureStorageConnectionString();

    // TODO: We should avoid having to use a temp directory

    var tempDirectory = Path.Combine(Path.GetTempPath(), "ghcrawler");
    if (Directory.Exists(tempDirectory))
    {
        Directory.Delete(tempDirectory, recursive: true);
    }

    Directory.CreateDirectory(tempDirectory);

    var cacheContainerName = "cache";
    var cacheContainerClient = new BlobContainerClient(connectionString, cacheContainerName);

    if (!reindex || startingRepoName is not null)
    {
        var startingBlobName = $"{startingRepoName}.crcache";
        var reachedStartingBlob = false;

        await foreach (var blob in cacheContainerClient.GetBlobsAsync())
        {
            if (!subscriptionList.Contains(blob.Name.Replace(".crcache", "")))
            {
                continue;
            }

            if (blob.Name == startingBlobName)
            {
                reachedStartingBlob = true;
            }

            // Repos at or after the starting repo will be re-crawled, so don't
            // bother downloading their cached state.
            if (reachedStartingBlob)
            {
                continue;
            }

            Console.WriteLine($"Downloading {blob.Name}...");

            var localPath = Path.Combine(tempDirectory, blob.Name);
            var localDirectory = Path.GetDirectoryName(localPath);
            Directory.CreateDirectory(localDirectory);

            var blobClient = new BlobClient(connectionString, cacheContainerName, blob.Name);
            await blobClient.DownloadToAsync(localPath);
        }
    }

    var factory = CreateGitHubClientFactory();
    var client = await factory.CreateAsync();

    var jsonOptions = new JsonSerializerOptions()
    {
        WriteIndented = true
    };

    var repos = new List<CrawledRepo>();
    var reachedStartingRepo = reindex && startingRepoName is null;

    foreach (var org in subscriptionList.Orgs)
    {
        var orgDirectory = Path.Join(tempDirectory, org);
        Directory.CreateDirectory(orgDirectory);

        var existingRepos = Directory.GetFiles(orgDirectory, "*.crcache")
                                     .Select(p => Path.GetFileNameWithoutExtension(p));

        if (!pullLatest)
        {
            Console.WriteLine($"Loading repos for {org}...");

            foreach (var repoName in existingRepos)
            {
                var blobName = $"{repoName}.crcache";
                var repoPath = Path.Join(orgDirectory, blobName);
                var repo = await CrawledRepo.LoadAsync(repoPath);
                if (repo is not null)
                {
                    repos.Add(repo);
                }
            }
        }
        else
        {
            Console.WriteLine($"Requesting repos for {org}...");

            var availableRepos = await RequestReposAsync(factory, client, org);

            var deletedRepos = existingRepos.ToHashSet(StringComparer.OrdinalIgnoreCase);
            deletedRepos.ExceptWith(availableRepos.Select(r => r.Name));

            foreach (var deletedRepo in deletedRepos)
            {
                var blobName = $"{org}/{deletedRepo}.crcache";
                var repoPath = Path.Join(tempDirectory, blobName);

                Console.WriteLine($"Deleting local file {blobName}...");
                File.Delete(repoPath);

                if (uploadToAzure)
                {
                    Console.WriteLine($"Deleting Azure blob {blobName}...");
                    await cacheContainerClient.DeleteBlobAsync(blobName);
                }
            }

            foreach (var repo in availableRepos)
            {
                if (!subscriptionList.Contains(org, repo.Name))
                {
                    continue;
                }

                var blobName = $"{org}/{repo.Name}.crcache";
                var repoPath = Path.Join(tempDirectory, blobName);

                if (string.Equals($"{org}/{repo.Name}", startingRepoName, StringComparison.OrdinalIgnoreCase))
                {
                    reachedStartingRepo = true;
                }

                CrawledRepo crawledRepo;
                try
                {
                    crawledRepo = await CrawledRepo.LoadAsync(repoPath);
                }
                catch (JsonException)
                {
                    Console.WriteLine($"WARNING: Couldn't parse {blobName}");
                    crawledRepo = null;
                }

                if (crawledRepo is null)
                {
                    crawledRepo = new CrawledRepo
                    {
                        Id = repo.Id,
                        Org = org,
                        Name = repo.Name
                    };
                }

                crawledRepo.IsArchived = repo.Archived;
                crawledRepo.Size = repo.Size;
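                // Decide whether this repo needs a full reindex. Clearing its
                // cached state below forces the next crawl to start from scratch.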
                repos.Add(crawledRepo);

                var repoIsDueForReindexing = crawledRepo.LastReindex is null ||
                                             crawledRepo.LastReindex?.AddDays(reindexIntervalInDays) <= today;

                if (reachedStartingRepo)
                {
                    Console.WriteLine($"Marking {repo.FullName} to be re-indexed because we reached the starting repo {startingRepoName}.");
                }

                if (repoIsDueForReindexing)
                {
                    if (crawledRepo.LastReindex is null)
                    {
                        Console.WriteLine($"Marking {repo.FullName} to be re-indexed because it was never fully indexed.");
                    }
                    else
                    {
                        Console.WriteLine($"Marking {repo.FullName} to be re-indexed because the last reindex was more than {reindexIntervalInDays} days ago, on {crawledRepo.LastReindex}.");
                    }
                }

                if (reachedStartingRepo || repoIsDueForReindexing)
                {
                    crawledRepo.Clear();
                }
            }
        }
    }

    // We want to ensure that all repos are fully reindexed at least once every four weeks.
    // That means we need to reindex at least #Repos / 28 per day.
    //
    // On top of that, we need to ensure that all repos which were never fully indexed (e.g.
    // they are new or were forced to be reindexed) are also reindexed.

    if (randomReindex)
    {
        var reposThatNeedReindexing = repos.Where(r => r.LastReindex is null).ToHashSet();
        var minimumNumberOfReposToBeReindexed = (int)Math.Ceiling(repos.Count / (float)reindexIntervalInDays);
        var numberOfReposThatNeedReindexing = reposThatNeedReindexing.Count;

        if (numberOfReposThatNeedReindexing < minimumNumberOfReposToBeReindexed)
        {
            // OK, there are fewer repos that need reindexing than what we want to reindex
            // per day. So let's randomly pick some repos to reindex.

            var remainingRepos = repos.Except(reposThatNeedReindexing).ToList();
            var choiceCount = minimumNumberOfReposToBeReindexed - numberOfReposThatNeedReindexing;
            var random = new Random();

            for (var choice = 0; choice < choiceCount; choice++)
            {
                var i = random.Next(0, remainingRepos.Count);
                var repo = remainingRepos[i];
                Console.WriteLine($"Marking {repo.FullName} to be re-indexed because it was randomly chosen.");
                repo.Clear();
                reposThatNeedReindexing.Add(repo);
                remainingRepos.RemoveAt(i);
            }
        }
    }

    if (pullLatest)
    {
        Console.WriteLine("Listing events...");

        var eventStore = new GitHubEventStore(connectionString);
        var events = await eventStore.ListAsync();

        Console.WriteLine($"Crawling {repos.Count:N0} repos, fully reindexing {repos.Count(r => r.LastReindex is null):N0} repos...");

        foreach (var crawledRepo in repos)
        {
            var blobName = $"{crawledRepo.FullName}.crcache";
            var repoPath = Path.Join(tempDirectory, blobName);
            var since = crawledRepo.IncrementalUpdateStart;
            var messages = new List<GitHubEventMessage>();

            if (since is null)
            {
                Console.WriteLine($"Crawling {crawledRepo.FullName}...");
            }
            else
            {
                var toBeDownloaded = events.Where(n => string.Equals(n.Org, crawledRepo.Org, StringComparison.OrdinalIgnoreCase) &&
                                                       string.Equals(n.Repo, crawledRepo.Name, StringComparison.OrdinalIgnoreCase))
                                           .ToArray();
                if (toBeDownloaded.Any())
                {
                    Console.WriteLine($"Loading {toBeDownloaded.Length:N0} events for {crawledRepo.FullName}...");

                    var i = 0;
                    var lastPercent = 0;

                    foreach (var name in toBeDownloaded)
                    {
                        var percent = (int)Math.Ceiling((float)i / toBeDownloaded.Length * 100);
                        i++;

                        if (percent % 10 == 0)
                        {
                            if (percent != lastPercent)
                            {
                                Console.Write($"{percent}%...");
                            }
                            lastPercent = percent;
                        }

                        var payload = await eventStore.LoadAsync(name);
                        var headers = payload.Headers.ToDictionary(kv => kv.Key,
                                                                   kv => new StringValues(kv.Value.ToArray()));
                        var body = payload.Body;
                        var message = GitHubEventMessage.Parse(headers, body);
                        messages.Add(message);
                    }

                    Console.WriteLine("done.");
                }

                Console.WriteLine($"Crawling {crawledRepo.FullName} since {since}...");
            }
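            // With any pending events in hand, update the repo itself: stamp the
            // reindex time, refresh area owners, labels, and milestones, replay
            // issue transfers, then pull changed issues and pull requests.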
            if (crawledRepo.LastReindex is null)
            {
                crawledRepo.LastReindex = DateTimeOffset.UtcNow;
            }

            crawledRepo.AreaOwners = await GetAreaOwnersAsync(crawledRepo.Org, crawledRepo.Name);

            var currentLabels = await RequestLabelsAsync(factory, client, crawledRepo.Org, crawledRepo.Name);
            SyncLabels(crawledRepo, currentLabels, out var labelById);

            var currentMilestones = await RequestMilestonesAsync(factory, client, crawledRepo.Org, crawledRepo.Name);
            SyncMilestones(crawledRepo, currentMilestones, out var milestoneById);

            // NOTE: GitHub's Issues.GetAllForRepository() doesn't include issues that were transferred.
            //
            // That's the good part. The bad part is that for the new repository where
            // it shows up, we have no way of knowing which repo it came from and which
            // number it used to have (even when looking at the issue's timeline data),
            // so we can't remove the issue from the source repo.
            //
            // However, since we're persisting the GitHub events we received, we can look
            // up which issues were transferred and remove them from the repo. This avoids
            // having to wait until we fully reindex the repo.
            //
            // Note, we remove transferred issues before pulling issues in case the issues
            // were transferred back; it seems GitHub reuses the numbers in that case.

            foreach (var message in messages.Where(m => m.Body.Action == "transferred"))
            {
                Console.WriteLine($"Removing {message.Body?.Repository?.FullName}#{message.Body?.Issue?.Number}: {message.Body?.Issue?.Title}");
                var number = message.Body?.Issue?.Number;
                if (number is not null)
                {
                    crawledRepo.Issues.Remove(number.Value);
                }
            }

            foreach (var issue in await RequestIssuesAsync(factory, client, crawledRepo.Org, crawledRepo.Name, since))
            {
                var crawledIssue = ConvertIssue(crawledRepo, issue, labelById, milestoneById);
                crawledRepo.Issues[issue.Number] = crawledIssue;
            }

            foreach (var pullRequest in await RequestPullRequestsAsync(factory, client, crawledRepo.Org, crawledRepo.Name, since))
            {
                if (crawledRepo.Issues.TryGetValue(pullRequest.Number, out var issue))
                {
                    UpdateIssue(issue, pullRequest);
                }

                // TODO: Get PR reviews
                // TODO: Get PR commits
                // TODO: Get PR status
            }

            await crawledRepo.SaveAsync(repoPath);

            if (uploadToAzure)
            {
                Console.WriteLine($"Uploading {blobName} to Azure...");
                var repoClient = new BlobClient(connectionString, cacheContainerName, blobName);
                await repoClient.UploadAsync(repoPath, overwrite: true);

                // After a full reindex the stored events are no longer needed;
                // the crawled state already reflects them.
                if (since is null)
                {
                    var eventsToBeDeleted = events.Where(e => string.Equals($"{e.Org}/{e.Repo}", crawledRepo.FullName, StringComparison.OrdinalIgnoreCase))
                                                  .ToArray();
                    Console.WriteLine($"Deleting {eventsToBeDeleted.Length:N0} events for {crawledRepo.FullName}...");
                    foreach (var e in eventsToBeDeleted)
                    {
                        await eventStore.DeleteAsync(e);
                    }
                }
            }
        }
    }

    foreach (var repo in repos)
    {
        var milestones = repo.Milestones.ToHashSet();
        var labels = repo.Labels.ToHashSet();

        foreach (var issue in repo.Issues.Values)
        {
            foreach (var label in issue.Labels.Where(l => !labels.Contains(l)))
            {
                Console.Error.WriteLine($"error: {repo.FullName}#{issue.Number}: label '{label.Name}' doesn't exist");
            }

            if (issue.Milestone is not null && !milestones.Contains(issue.Milestone))
            {
                Console.Error.WriteLine($"error: {repo.FullName}#{issue.Number}: milestone '{issue.Milestone.Title}' doesn't exist");
            }
        }
    }

    Console.WriteLine("Creating trie...");

    var trie = new CrawledTrie<CrawledIssue>();
    foreach (var repo in repos)
    {
        foreach (var issue in repo.Issues.Values)
        {
            trie.Add(issue);
        }
    }

    Console.WriteLine("Creating index...");

    var index = new CrawledIndex()
    {
        Repos = repos.ToList(),
        Trie = trie
    };
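    // Persist the index locally (or to the requested output path) and optionally
    // publish it to the "index" blob container.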
    var indexName = "index.cicache";
    var indexPath = string.IsNullOrEmpty(outputPath)
                        ? Path.Join(tempDirectory, indexName)
                        : outputPath;

    await index.SaveAsync(indexPath);

    if (uploadToAzure)
    {
        Console.WriteLine("Uploading index to Azure...");
        var indexClient = new BlobClient(connectionString, "index", indexName);
        await indexClient.UploadAsync(indexPath, overwrite: true);
    }

    Console.WriteLine("Deleting temp files...");
    Directory.Delete(tempDirectory, recursive: true);
}
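// Usage sketch (hypothetical, not part of this file): how a command-line entry
// point might wire up RunAsync. The CrawledSubscriptionList construction and all
// argument values below are illustrative assumptions, not the actual CLI parsing.
//
//     private static Task Main()
//     {
//         var subscriptionList = new CrawledSubscriptionList(); // assumed to cover the configured orgs/repos
//         return RunAsync(subscriptionList,
//                         reindex: false,         // don't force a full reindex
//                         pullLatest: true,       // refresh repo lists and crawl incrementally
//                         randomReindex: true,    // keep the rolling 28-day full-reindex schedule
//                         uploadToAzure: false,   // keep results local
//                         startingRepoName: null, // no resume point
//                         outputPath: null);      // write the index into the temp directory
//     }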