/// <summary>
/// Command-line entry point: parses repo specs and options, validates option
/// combinations, and kicks off the crawl via <c>RunAsync</c>.
/// </summary>
/// <param name="args">Raw command-line arguments.</param>
/// <returns>0 on success, 1 on invalid arguments or a crawl failure.</returns>
private static async Task<int> Main(string[] args)
{
    var appName = Path.GetFileNameWithoutExtension(Environment.GetCommandLineArgs()[0]);
    var repoSpecs = new List<string>();
    var outputPath = "";
    var reindex = false;
    var pullLatest = true;
    var randomReindex = true;
    var uploadToAzure = true;
    var startingRepoName = (string)null;

    // Running with no arguments at all is treated as a request for help.
    var help = args.Length == 0;
    var useSubscriptions = false;

    var options = new OptionSet
    {
        $"usage: {appName} <repo-spec>... [OPTIONS]+",
        $"",
        $"Examples:",
        $" {appName} dotnet aspnet",
        $"",
        $" Indexes all public repos in the dotnet and aspnet orgs",
        $"",
        $" {appName} dotnet microsoft/CsWinRT microsoft/MSBuild",
        $"",
        $" Indexes all public repos in the dotnet orgs and the CsWinRT",
        $" and MSBuild repos in the Microsoft org.",
        $"",
        $"<repo-spec> can be of the following forms:",
        $" owner Indexes all public repos of the owner",
        $" owner/repo Indexes only the one repo, if it is public",
        $"",
        $"Options:",
        { "subscriptions", "Indicates whether to use the built-in subscriptions or not", v => useSubscriptions = true },
        { "out=", "The output {path} the index should be written to", v => outputPath = v },
        { "reindex", "Specifies that the repo should be reindexed", v => reindex = true },
        { "starting-repo=", "The starting {repo} to re-index", v => startingRepoName = v },
        // The options below are hidden (last argument 'true' suppresses them from help).
        { "no-pull-latest", null, v => pullLatest = false, true },
        { "no-random-reindex", null, v => randomReindex = false, true },
        { "no-upload", null, v => uploadToAzure = false, true },
        { "h|?|help", null, v => help = true, true },
        new ResponseFileSource()
    };

    try
    {
        var parameters = options.Parse(args).ToArray();

        if (help)
        {
            options.WriteOptionDescriptions(Console.Error);
            return 0;
        }

        var unprocessed = new List<string>();

        foreach (var parameter in parameters)
        {
            // Anything not starting with a letter can't be a <repo-spec>, so it
            // must be an unrecognized option. Guard against empty strings, which
            // would otherwise throw on parameter[0].
            if (parameter.Length == 0 || !char.IsLetter(parameter[0]))
                unprocessed.Add(parameter);
            else
                repoSpecs.Add(parameter);
        }

        if (unprocessed.Any())
        {
            foreach (var option in unprocessed)
                Console.Error.WriteLine($"error: unrecognized argument {option}");

            return 1;
        }
    }
    catch (Exception ex)
    {
        Console.Error.WriteLine(ex.ToString());
        return 1;
    }

    // Start from the built-in subscriptions (if requested) and add any repo
    // specs given on the command line.
    var subscriptionList = useSubscriptions
        ? CrawledSubscriptionList.CreateDefault()
        : new CrawledSubscriptionList();

    foreach (var repoSpec in repoSpecs)
        subscriptionList.Add(repoSpec);

    // Validate mutually-dependent options.
    if (reindex && !pullLatest)
    {
        Console.Error.WriteLine($"error: --reindex can't be combined with --no-pull-latest");
        return 1;
    }

    if (startingRepoName is not null && !reindex)
    {
        Console.Error.WriteLine($"error: --starting-repo can't be used unless --reindex is specified");
        return 1;
    }

    try
    {
        await RunAsync(subscriptionList, reindex, pullLatest, randomReindex, uploadToAzure, startingRepoName, outputPath);
        return 0;
    }
    catch (Exception ex) when (!Debugger.IsAttached)
    {
        // Fatal errors go to stderr, consistent with the other error paths above.
        // When a debugger is attached, let the exception escape so it breaks there.
        Console.Error.WriteLine($"fatal: {ex}");
        return 1;
    }
}
/// <summary>
/// Crawls all subscribed repos: downloads cached per-repo data from Azure blob
/// storage, refreshes issues/PRs/labels/milestones from GitHub, marks repos for
/// periodic full re-indexing, and finally builds and saves the search index.
/// </summary>
/// <param name="subscriptionList">The orgs/repos to crawl.</param>
/// <param name="reindex">When true, repos are fully re-indexed rather than incrementally updated.</param>
/// <param name="pullLatest">When false, only locally cached repo data is loaded; nothing is fetched from GitHub.</param>
/// <param name="randomReindex">When true, randomly picks extra repos so the whole set is re-indexed over the interval.</param>
/// <param name="uploadToAzure">When true, results (and the final index) are uploaded to Azure blob storage.</param>
/// <param name="startingRepoName">Optional "org/repo" at which re-indexing starts; repos from this one onward are cleared.</param>
/// <param name="outputPath">Optional local path for the index file; defaults to the temp directory.</param>
private static async Task RunAsync(CrawledSubscriptionList subscriptionList,
                                   bool reindex,
                                   bool pullLatest,
                                   bool randomReindex,
                                   bool uploadToAzure,
                                   string startingRepoName,
                                   string outputPath)
{
    // Repos are considered stale and due for a full re-index after this many days.
    var reindexIntervalInDays = 28;
    var today = DateTime.Today;
    var connectionString = GetAzureStorageConnectionString();

    // TODO: We should avoid having to use a temp directory

    // Start from a clean scratch directory under %TEMP%.
    var tempDirectory = Path.Combine(Path.GetTempPath(), "ghcrawler");
    if (Directory.Exists(tempDirectory))
    {
        Directory.Delete(tempDirectory, recursive: true);
    }

    Directory.CreateDirectory(tempDirectory);

    var cacheContainerName = "cache";
    var cacheContainerClient = new BlobContainerClient(connectionString, cacheContainerName);

    // Download cached repo data unless we're doing a full re-index of everything.
    // With a starting repo, caches are only downloaded for repos *before* it;
    // the starting repo and everything after will be re-indexed from scratch.
    if (!reindex || startingRepoName is not null)
    {
        var startingBlobName = $"{startingRepoName}.crcache";
        var reachedStartingBlob = false;

        await foreach (var blob in cacheContainerClient.GetBlobsAsync())
        {
            // Skip blobs for repos we aren't subscribed to.
            if (!subscriptionList.Contains(blob.Name.Replace(".crcache", "")))
            {
                continue;
            }

            if (blob.Name == startingBlobName)
            {
                reachedStartingBlob = true;
            }

            // Once the starting blob is reached, stop downloading caches.
            if (reachedStartingBlob)
            {
                continue;
            }

            Console.WriteLine($"Downloading {blob.Name}...");

            var localPath = Path.Combine(tempDirectory, blob.Name);
            var localDirectory = Path.GetDirectoryName(localPath);
            Directory.CreateDirectory(localDirectory);

            var blobClient = new BlobClient(connectionString, cacheContainerName, blob.Name);
            await blobClient.DownloadToAsync(localPath);
        }
    }

    var factory = CreateGitHubClientFactory();
    var client = await factory.CreateAsync();

    // NOTE(review): jsonOptions appears unused in this method — verify before removing.
    var jsonOptions = new JsonSerializerOptions() { WriteIndented = true };

    var repos = new List<CrawledRepo>();

    // With --reindex and no starting repo, everything counts as "reached".
    var reachedStartingRepo = reindex && startingRepoName is null;

    foreach (var org in subscriptionList.Orgs)
    {
        var orgDirectory = Path.Join(tempDirectory, org);
        Directory.CreateDirectory(orgDirectory);

        // Repo names for which we have a local cache file.
        var existingRepos = Directory.GetFiles(orgDirectory, "*.crcache")
                                     .Select(p => Path.GetFileNameWithoutExtension(p));

        if (!pullLatest)
        {
            // Offline mode: just load whatever caches we already have.
            Console.WriteLine($"Loading repos for {org}...");

            foreach (var repoName in existingRepos)
            {
                var blobName = $"{repoName}.crcache";
                var repoPath = Path.Join(orgDirectory, blobName);
                var repo = await CrawledRepo.LoadAsync(repoPath);
                if (repo is not null)
                {
                    repos.Add(repo);
                }
            }
        }
        else
        {
            Console.WriteLine($"Requesting repos for {org}...");

            var availableRepos = await RequestReposAsync(factory, client, org);

            // Any cached repo that GitHub no longer reports has been deleted
            // (or made private); remove its local file and Azure blob.
            var deletedRepos = existingRepos.ToHashSet(StringComparer.OrdinalIgnoreCase);
            deletedRepos.ExceptWith(availableRepos.Select(r => r.Name));

            foreach (var deletedRepo in deletedRepos)
            {
                var blobName = $"{org}/{deletedRepo}.crcache";
                var repoPath = Path.Join(tempDirectory, blobName);
                Console.WriteLine($"Deleting local file {blobName}...");
                File.Delete(repoPath);

                if (uploadToAzure)
                {
                    Console.WriteLine($"Deleting Azure blob {blobName}...");
                    await cacheContainerClient.DeleteBlobAsync(blobName);
                }
            }

            foreach (var repo in availableRepos)
            {
                if (!subscriptionList.Contains(org, repo.Name))
                {
                    continue;
                }

                var blobName = $"{org}/{repo.Name}.crcache";
                var repoPath = Path.Join(tempDirectory, blobName);

                if (string.Equals($"{org}/{repo.Name}", startingRepoName, StringComparison.OrdinalIgnoreCase))
                {
                    reachedStartingRepo = true;
                }

                // Load the cached repo; a corrupt cache is treated the same as
                // no cache at all (the repo will be crawled from scratch).
                CrawledRepo crawledRepo;
                try
                {
                    crawledRepo = await CrawledRepo.LoadAsync(repoPath);
                }
                catch (JsonException)
                {
                    Console.WriteLine($"WARNING: Couldn't parse {blobName}");
                    crawledRepo = null;
                }

                if (crawledRepo is null)
                {
                    crawledRepo = new CrawledRepo
                    {
                        Id = repo.Id,
                        Org = org,
                        Name = repo.Name
                    };
                }

                // Always refresh metadata from the live repo listing.
                crawledRepo.IsArchived = repo.Archived;
                crawledRepo.Size = repo.Size;
                repos.Add(crawledRepo);

                // Due for re-indexing when never fully indexed or last full
                // index is older than the interval.
                var repoIsDueForReindexing = crawledRepo.LastReindex is null ||
                                             crawledRepo.LastReindex?.AddDays(reindexIntervalInDays) <= today;

                if (reachedStartingRepo)
                {
                    Console.WriteLine($"Marking {repo.FullName} to be re-indexed because we reached the starting repo {startingRepoName}.");
                }

                if (repoIsDueForReindexing)
                {
                    if (crawledRepo.LastReindex is null)
                    {
                        Console.WriteLine($"Marking {repo.FullName} to be re-indexed because it was never fully indexed.");
                    }
                    else
                    {
                        Console.WriteLine($"Marking {repo.FullName} to be re-indexed because it was more than {reindexIntervalInDays} days ago, on {crawledRepo.LastReindex}.");
                    }
                }

                // Clearing the repo forces a full crawl below.
                if (reachedStartingRepo || repoIsDueForReindexing)
                {
                    crawledRepo.Clear();
                }
            }
        }
    }

    // We want to ensure that all repos are fully-reindexed at least once every four weeks.
    // That means we need to reindex at least #Repos / 28 per day.
    //
    // On top of that, we need to ensure that all repos which were never fully indexed (e.g.
    // they are new or were forced to be reindexed) are also reindexed.

    if (randomReindex)
    {
        var reposThatNeedReindexing = repos.Where(r => r.LastReindex is null).ToHashSet();
        var minimumNumberOfReposToBeReindexed = (int)Math.Ceiling(repos.Count / (float)reindexIntervalInDays);
        var numberOfReposThatNeedReindexing = reposThatNeedReindexing.Count;

        if (numberOfReposThatNeedReindexing < minimumNumberOfReposToBeReindexed)
        {
            // OK, there are fewer repos that need reindexing than what we want to reindex
            // per day. So let's randomly pick some repos to reindex.

            var remainingRepos = repos.Except(reposThatNeedReindexing).ToList();
            var choiceCount = minimumNumberOfReposToBeReindexed - numberOfReposThatNeedReindexing;
            var random = new Random();

            for (var choice = 0; choice < choiceCount; choice++)
            {
                // Pick without replacement: chosen repos are removed from the pool.
                var i = random.Next(0, remainingRepos.Count);
                var repo = remainingRepos[i];
                Console.WriteLine($"Marking {repo.FullName} to be re-indexed because it was randomly chosen.");
                repo.Clear();
                reposThatNeedReindexing.Add(repo);
                remainingRepos.RemoveAt(i);
            }
        }
    }

    if (pullLatest)
    {
        Console.WriteLine($"Listing events...");

        var eventStore = new GitHubEventStore(connectionString);
        var events = await eventStore.ListAsync();

        Console.WriteLine($"Crawling {repos.Count:N0} repos, fully reindexing {repos.Count(r => r.LastReindex is null):N0} repos...");

        foreach (var crawledRepo in repos)
        {
            var blobName = $"{crawledRepo.FullName}.crcache";
            var repoPath = Path.Join(tempDirectory, blobName);

            // null means "no previous crawl" => full crawl; otherwise incremental since that time.
            var since = crawledRepo.IncrementalUpdateStart;
            var messages = new List<GitHubEventMessage>();

            if (since is null)
            {
                Console.WriteLine($"Crawling {crawledRepo.FullName}...");
            }
            else
            {
                // Incremental crawl: replay the stored webhook events for this repo.
                var toBeDownloaded = events.Where(n => string.Equals(n.Org, crawledRepo.Org, StringComparison.OrdinalIgnoreCase) &&
                                                       string.Equals(n.Repo, crawledRepo.Name, StringComparison.OrdinalIgnoreCase))
                                           .ToArray();
                if (toBeDownloaded.Any())
                {
                    Console.WriteLine($"Loading {toBeDownloaded.Length:N0} events for {crawledRepo.FullName}...");

                    // Print coarse progress (every 10%) while downloading events.
                    var i = 0;
                    var lastPercent = 0;

                    foreach (var name in toBeDownloaded)
                    {
                        var percent = (int)Math.Ceiling((float)i / toBeDownloaded.Length * 100);
                        i++;
                        if (percent % 10 == 0)
                        {
                            if (percent != lastPercent)
                            {
                                Console.Write($"{percent}%...");
                            }
                            lastPercent = percent;
                        }

                        var payload = await eventStore.LoadAsync(name);
                        var headers = payload.Headers.ToDictionary(kv => kv.Key, kv => new StringValues(kv.Value.ToArray()));
                        var body = payload.Body;
                        var message = GitHubEventMessage.Parse(headers, body);
                        messages.Add(message);
                    }

                    Console.WriteLine("done.");
                }

                Console.WriteLine($"Crawling {crawledRepo.FullName} since {since}...");
            }

            // Record that a full index happened (or is happening now).
            if (crawledRepo.LastReindex is null)
            {
                crawledRepo.LastReindex = DateTimeOffset.UtcNow;
            }

            crawledRepo.AreaOwners = await GetAreaOwnersAsync(crawledRepo.Org, crawledRepo.Name);

            var currentLabels = await RequestLabelsAsync(factory, client, crawledRepo.Org, crawledRepo.Name);
            SyncLabels(crawledRepo, currentLabels, out var labelById);

            var currentMilestones = await RequestMilestonesAsync(factory, client, crawledRepo.Org, crawledRepo.Name);
            SyncMilestones(crawledRepo, currentMilestones, out var milestoneById);

            // NOTE: GitHub's Issues.GetAllForRepository() doesn't include issues that were transferred
            //
            // That's the good part. The bad part is that for the new repository where
            // it shows up, we have no way of knowing which repo it came from and which
            // number it used to have (even when looking at the issues timeline data),
            // so we can't remove the issue from the source repo.
            //
            // However, since we're persisting GitHub events we received, we can look
            // up which issues were transferred and remove them from the repo. This avoids
            // having to wait until we fully reindex the repo.
            //
            // Note, we remove transferred issues before pulling issues in case the issues
            // were being transferred back; it seems GitHub is reusing the numbers in that
            // case.

            foreach (var message in messages.Where(m => m.Body.Action == "transferred"))
            {
                Console.WriteLine($"Removing {message.Body?.Repository?.FullName}#{message.Body?.Issue?.Number}: {message.Body?.Issue?.Title}");
                var number = message.Body?.Issue?.Number;
                if (number is not null)
                {
                    crawledRepo.Issues.Remove(number.Value);
                }
            }

            foreach (var issue in await RequestIssuesAsync(factory, client, crawledRepo.Org, crawledRepo.Name, since))
            {
                var crawledIssue = ConvertIssue(crawledRepo, issue, labelById, milestoneById);
                crawledRepo.Issues[issue.Number] = crawledIssue;
            }

            foreach (var pullRequest in await RequestPullRequestsAsync(factory, client, crawledRepo.Org, crawledRepo.Name, since))
            {
                // PRs augment the issue that shares their number.
                if (crawledRepo.Issues.TryGetValue(pullRequest.Number, out var issue))
                {
                    UpdateIssue(issue, pullRequest);
                }

                // TODO: Get PR reviews
                // TODO: Get PR commits
                // TODO: Get PR status
            }

            await crawledRepo.SaveAsync(repoPath);

            if (uploadToAzure)
            {
                Console.WriteLine($"Uploading {blobName} to Azure...");

                var repoClient = new BlobClient(connectionString, cacheContainerName, blobName);
                await repoClient.UploadAsync(repoPath, overwrite: true);

                // After a full crawl the stored events for this repo are no
                // longer needed — the crawl already reflects them.
                if (since is null)
                {
                    var eventsToBeDeleted = events.Where(e => string.Equals($"{e.Org}/{e.Repo}", crawledRepo.FullName, StringComparison.OrdinalIgnoreCase))
                                                  .ToArray();

                    Console.WriteLine($"Deleting {eventsToBeDeleted.Length:N0} events for {crawledRepo.FullName}...");

                    foreach (var e in eventsToBeDeleted)
                    {
                        await eventStore.DeleteAsync(e);
                    }
                }
            }
        }
    }

    // Sanity check: every issue's labels and milestone must exist in the repo's
    // own label/milestone sets; report (but don't fail on) any dangling references.
    foreach (var repo in repos)
    {
        var milestones = repo.Milestones.ToHashSet();
        var labels = repo.Labels.ToHashSet();

        foreach (var issue in repo.Issues.Values)
        {
            foreach (var label in issue.Labels.Where(l => !labels.Contains(l)))
            {
                Console.Error.WriteLine($"error: {repo.FullName}#{issue.Number}: label '{label.Name}' doesn't exist");
            }

            if (issue.Milestone is not null && !milestones.Contains(issue.Milestone))
            {
                Console.Error.WriteLine($"error: {repo.FullName}#{issue.Number}: milestone '{issue.Milestone.Title}' doesn't exist");
            }
        }
    }

    Console.WriteLine("Creating trie...");

    // Build the search trie over all crawled issues.
    var trie = new CrawledTrie<CrawledIssue>();

    foreach (var repo in repos)
    {
        foreach (var issue in repo.Issues.Values)
        {
            trie.Add(issue);
        }
    }

    Console.WriteLine("Creating index...");

    var index = new CrawledIndex()
    {
        Repos = repos.ToList(),
        Trie = trie
    };

    var indexName = "index.cicache";

    // --out overrides the default location in the temp directory.
    var indexPath = string.IsNullOrEmpty(outputPath)
        ? Path.Join(tempDirectory, indexName)
        : outputPath;

    await index.SaveAsync(indexPath);

    if (uploadToAzure)
    {
        Console.WriteLine("Uploading index to Azure...");
        var indexClient = new BlobClient(connectionString, "index", indexName);
        await indexClient.UploadAsync(indexPath, overwrite: true);
    }

    Console.WriteLine("Deleting temp files...");
    Directory.Delete(tempDirectory, recursive: true);
}