Example #1
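
A minimal loader that replays GitHub webhook events persisted in an event store: each payload's headers and body are reconstructed, parsed back into a GitHubEventMessage, and enqueued for processing.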
        private async Task LoadEventsAsync()
        {
            try
            {
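                // Replay every persisted webhook payload: rebuild the original
                // HTTP headers and body, parse them back into a message, and
                // enqueue it as if it had just been delivered.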
                foreach (var name in await _store.ListAsync())
                {
                    var payload = await _store.LoadAsync(name);

                    var headers = payload.Headers.ToDictionary(kv => kv.Key, kv => new StringValues(kv.Value.ToArray()));
                    var body    = payload.Body;
                    var message = GitHubEventMessage.Parse(headers, body);
                    _processingService.Enqueue(message);
                }
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Couldn't load stored events");
            }
        }
Example #2
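
The crawler's main driver: it seeds a local cache from Azure blob storage, decides which repos are due for a full re-index, incrementally crawls issues and pull requests per repo, validates label and milestone references, and finally builds and uploads a search index.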
        private static async Task RunAsync(CrawledSubscriptionList subscriptionList, bool reindex, bool pullLatest, bool randomReindex, bool uploadToAzure, string startingRepoName, string outputPath)
        {
            var reindexIntervalInDays = 28;
            var today = DateTime.Today;

            var connectionString = GetAzureStorageConnectionString();

            // TODO: We should avoid having to use a temp directory

            var tempDirectory = Path.Combine(Path.GetTempPath(), "ghcrawler");

            if (Directory.Exists(tempDirectory))
            {
                Directory.Delete(tempDirectory, recursive: true);
            }

            Directory.CreateDirectory(tempDirectory);

            var cacheContainerName   = "cache";
            var cacheContainerClient = new BlobContainerClient(connectionString, cacheContainerName);

            if (!reindex || startingRepoName is not null)
            {
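                // Seed the local cache from previously uploaded blobs. If a
                // starting repo was given, stop copying once its blob is reached:
                // that repo and everything after it will be re-crawled from scratch.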
                var startingBlobName    = $"{startingRepoName}.crcache";
                var reachedStartingBlob = false;

                await foreach (var blob in cacheContainerClient.GetBlobsAsync())
                {
                    if (!subscriptionList.Contains(blob.Name.Replace(".crcache", "")))
                    {
                        continue;
                    }

                    if (blob.Name == startingBlobName)
                    {
                        reachedStartingBlob = true;
                    }

                    if (reachedStartingBlob)
                    {
                        continue;
                    }

                    Console.WriteLine($"Downloading {blob.Name}...");

                    var localPath      = Path.Combine(tempDirectory, blob.Name);
                    var localDirectory = Path.GetDirectoryName(localPath);
                    Directory.CreateDirectory(localDirectory);

                    var blobClient = new BlobClient(connectionString, cacheContainerName, blob.Name);
                    await blobClient.DownloadToAsync(localPath);
                }
            }

            var factory = CreateGitHubClientFactory();
            var client  = await factory.CreateAsync();

            var jsonOptions = new JsonSerializerOptions()
            {
                WriteIndented = true
            };

            var repos = new List<CrawledRepo>();

            // A full reindex without a starting repo means every repo counts as
            // "reached" and gets cleared; otherwise the flag flips once we
            // encounter the starting repo during enumeration.
            var reachedStartingRepo = reindex && startingRepoName is null;

            foreach (var org in subscriptionList.Orgs)
            {
                var orgDirectory = Path.Join(tempDirectory, org);
                Directory.CreateDirectory(orgDirectory);

                var existingRepos = Directory.GetFiles(orgDirectory, "*.crcache")
                                    .Select(p => Path.GetFileNameWithoutExtension(p));

                if (!pullLatest)
                {
                    Console.WriteLine($"Loading repos for {org}...");

                    foreach (var repoName in existingRepos)
                    {
                        var blobName = $"{repoName}.crcache";
                        var repoPath = Path.Join(orgDirectory, blobName);
                        var repo     = await CrawledRepo.LoadAsync(repoPath);

                        if (repo is not null)
                        {
                            repos.Add(repo);
                        }
                    }
                }
                else
                {
                    Console.WriteLine($"Requesting repos for {org}...");
                    var availableRepos = await RequestReposAsync(factory, client, org);

                    // Repos that exist locally but are no longer returned by GitHub
                    // were deleted or renamed; remove their cached files.
                    var deletedRepos = existingRepos.ToHashSet(StringComparer.OrdinalIgnoreCase);
                    deletedRepos.ExceptWith(availableRepos.Select(r => r.Name));

                    foreach (var deletedRepo in deletedRepos)
                    {
                        var blobName = $"{org}/{deletedRepo}.crcache";
                        var repoPath = Path.Join(tempDirectory, blobName);

                        Console.WriteLine($"Deleting local file {blobName}...");
                        File.Delete(repoPath);

                        if (uploadToAzure)
                        {
                            Console.WriteLine($"Deleting Azure blob {blobName}...");
                            await cacheContainerClient.DeleteBlobAsync(blobName);
                        }
                    }

                    foreach (var repo in availableRepos)
                    {
                        if (!subscriptionList.Contains(org, repo.Name))
                        {
                            continue;
                        }

                        var blobName = $"{org}/{repo.Name}.crcache";
                        var repoPath = Path.Join(tempDirectory, blobName);

                        if (string.Equals($"{org}/{repo.Name}", startingRepoName, StringComparison.OrdinalIgnoreCase))
                        {
                            reachedStartingRepo = true;
                        }

                        CrawledRepo crawledRepo;
                        try
                        {
                            crawledRepo = await CrawledRepo.LoadAsync(repoPath);
                        }
                        catch (JsonException)
                        {
                            Console.WriteLine($"WARNING: Couldn't parse {blobName}");
                            crawledRepo = null;
                        }

                        if (crawledRepo is null)
                        {
                            crawledRepo = new CrawledRepo
                            {
                                Id   = repo.Id,
                                Org  = org,
                                Name = repo.Name
                            };
                        }

                        crawledRepo.IsArchived = repo.Archived;
                        crawledRepo.Size       = repo.Size;

                        repos.Add(crawledRepo);

                        var repoIsDueForReindexing = crawledRepo.LastReindex is null ||
                                                     crawledRepo.LastReindex.Value.AddDays(reindexIntervalInDays) <= today;

                        if (reachedStartingRepo && startingRepoName is not null)
                        {
                            Console.WriteLine($"Marking {repo.FullName} to be re-indexed because we reached the starting repo {startingRepoName}.");
                        }

                        if (repoIsDueForReindexing)
                        {
                            if (crawledRepo.LastReindex is null)
                            {
                                Console.WriteLine($"Marking {repo.FullName} to be re-indexed because it was never fully indexed.");
                            }
                            else
                            {
                                Console.WriteLine($"Marking {repo.FullName} to be re-indexed because it was more than {reindexIntervalInDays} days ago, on {crawledRepo.LastReindex}.");
                            }
                        }

                        if (reachedStartingRepo || repoIsDueForReindexing)
                        {
                            crawledRepo.Clear();
                        }
                    }
                }
            }

            // We want to ensure that all repos are fully reindexed at least once every four weeks.
            // That means we need to reindex at least #Repos / 28 repos per day.
            //
            // On top of that, we need to ensure that all repos which were never fully indexed (e.g.
            // they are new or were forced to be reindexed) are also reindexed.

            if (randomReindex)
            {
                var reposThatNeedReindexing = repos.Where(r => r.LastReindex is null).ToHashSet();

                var minimumNumberOfReposToBeReindexed = (int)Math.Ceiling(repos.Count / (float)reindexIntervalInDays);
                var numberOfReposThatNeedReindexing   = reposThatNeedReindexing.Count;

                if (numberOfReposThatNeedReindexing < minimumNumberOfReposToBeReindexed)
                {
                    // OK, there are fewer repos that need reindexing than what we want to reindex
                    // per day. So let's randomly pick some repos to reindex.

                    var remainingRepos = repos.Except(reposThatNeedReindexing).ToList();
                    var choiceCount    = minimumNumberOfReposToBeReindexed - numberOfReposThatNeedReindexing;

                    var random = new Random();

                    for (var choice = 0; choice < choiceCount; choice++)
                    {
                        var i    = random.Next(0, remainingRepos.Count);
                        var repo = remainingRepos[i];

                        Console.WriteLine($"Marking {repo.FullName} to be re-indexed because it was randomly chosen.");

                        repo.Clear();
                        reposThatNeedReindexing.Add(repo);
                        remainingRepos.RemoveAt(i);
                    }
                }
            }

            if (pullLatest)
            {
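                // Stored webhook events serve two purposes below: they reveal
                // issues that were transferred between repos, and they are deleted
                // once a fully re-crawled repo has been uploaded.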
                Console.WriteLine($"Listing events...");

                var eventStore = new GitHubEventStore(connectionString);
                var events     = await eventStore.ListAsync();

                Console.WriteLine($"Crawling {repos.Count:N0} repos, fully reindexing {repos.Count(r => r.LastReindex is null):N0} repos...");

                foreach (var crawledRepo in repos)
                {
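                    // Per-repo pipeline: replay relevant events, sync labels and
                    // milestones, crawl issues and pull requests since the last
                    // checkpoint, then persist (and optionally upload) the cache.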
                    var blobName = $"{crawledRepo.FullName}.crcache";
                    var repoPath = Path.Join(tempDirectory, blobName);
                    // IncrementalUpdateStart is null when the repo was cleared or was
                    // never indexed, which forces a full crawl below.
                    var since    = crawledRepo.IncrementalUpdateStart;

                    var messages = new List<GitHubEventMessage>();

                    if (since is null)
                    {
                        Console.WriteLine($"Crawling {crawledRepo.FullName}...");
                    }
                    else
                    {
                        var toBeDownloaded = events.Where(n => string.Equals(n.Org, crawledRepo.Org, StringComparison.OrdinalIgnoreCase) &&
                                                          string.Equals(n.Repo, crawledRepo.Name, StringComparison.OrdinalIgnoreCase))
                                             .ToArray();

                        if (toBeDownloaded.Any())
                        {
                            Console.WriteLine($"Loading {toBeDownloaded.Length:N0} events for {crawledRepo.FullName}...");

                            var i           = 0;
                            var lastPercent = 0;

                            foreach (var name in toBeDownloaded)
                            {
                                var percent = (int)Math.Ceiling((float)i / toBeDownloaded.Length * 100);
                                i++;
                                if (percent % 10 == 0)
                                {
                                    if (percent != lastPercent)
                                    {
                                        Console.Write($"{percent}%...");
                                    }

                                    lastPercent = percent;
                                }

                                var payload = await eventStore.LoadAsync(name);

                                var headers = payload.Headers.ToDictionary(kv => kv.Key, kv => new StringValues(kv.Value.ToArray()));
                                var body    = payload.Body;
                                var message = GitHubEventMessage.Parse(headers, body);
                                messages.Add(message);
                            }

                            Console.WriteLine("done.");
                        }

                        Console.WriteLine($"Crawling {crawledRepo.FullName} since {since}...");
                    }

                    if (crawledRepo.LastReindex is null)
                    {
                        crawledRepo.LastReindex = DateTimeOffset.UtcNow;
                    }

                    crawledRepo.AreaOwners = await GetAreaOwnersAsync(crawledRepo.Org, crawledRepo.Name);

                    var currentLabels = await RequestLabelsAsync(factory, client, crawledRepo.Org, crawledRepo.Name);

                    SyncLabels(crawledRepo, currentLabels, out var labelById);

                    var currentMilestones = await RequestMilestonesAsync(factory, client, crawledRepo.Org, crawledRepo.Name);

                    SyncMilestones(crawledRepo, currentMilestones, out var milestoneById);

                    // NOTE: GitHub's Issues.GetAllForRepository() doesn't include issues that were transferred.
                    //
                    // That's the good part. The bad part is that for the new repository where
                    // such an issue shows up, we have no way of knowing which repo it came from
                    // and which number it used to have (even when looking at the issue's
                    // timeline data), so we can't remove the issue from the source repo.
                    //
                    // However, since we persist the GitHub events we receive, we can look up
                    // which issues were transferred and remove them from the source repo. This
                    // avoids having to wait until we fully reindex the repo.
                    //
                    // Note: we remove transferred issues before pulling issues in case an issue
                    // was transferred back; GitHub seems to reuse issue numbers in that case.

                    foreach (var message in messages.Where(m => m.Body.Action == "transferred"))
                    {
                        Console.WriteLine($"Removing {message.Body?.Repository?.FullName}#{message.Body?.Issue?.Number}: {message.Body?.Issue?.Title}");

                        var number = message.Body?.Issue?.Number;
                        if (number is not null)
                        {
                            crawledRepo.Issues.Remove(number.Value);
                        }
                    }

                    foreach (var issue in await RequestIssuesAsync(factory, client, crawledRepo.Org, crawledRepo.Name, since))
                    {
                        var crawledIssue = ConvertIssue(crawledRepo, issue, labelById, milestoneById);
                        crawledRepo.Issues[issue.Number] = crawledIssue;
                    }

                    foreach (var pullRequest in await RequestPullRequestsAsync(factory, client, crawledRepo.Org, crawledRepo.Name, since))
                    {
                        if (crawledRepo.Issues.TryGetValue(pullRequest.Number, out var issue))
                        {
                            UpdateIssue(issue, pullRequest);
                        }

                        // TODO: Get PR reviews
                        // TODO: Get PR commits
                        // TODO: Get PR status
                    }

                    await crawledRepo.SaveAsync(repoPath);

                    if (uploadToAzure)
                    {
                        Console.WriteLine($"Uploading {blobName} to Azure...");
                        var repoClient = new BlobClient(connectionString, cacheContainerName, blobName);
                        await repoClient.UploadAsync(repoPath, overwrite: true);

                        if (since is null)
                        {
                            var eventsToBeDeleted = events.Where(e => string.Equals($"{e.Org}/{e.Repo}", crawledRepo.FullName, StringComparison.OrdinalIgnoreCase))
                                                    .ToArray();

                            Console.WriteLine($"Deleting {eventsToBeDeleted.Length:N0} events for {crawledRepo.FullName}...");
                            foreach (var e in eventsToBeDeleted)
                            {
                                await eventStore.DeleteAsync(e);
                            }
                        }
                    }
                }
            }

            foreach (var repo in repos)
            {
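                // Sanity check: every label and milestone referenced by an issue
                // must still exist on its repo.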
                var milestones = repo.Milestones.ToHashSet();
                var labels     = repo.Labels.ToHashSet();

                foreach (var issue in repo.Issues.Values)
                {
                    foreach (var label in issue.Labels.Where(l => !labels.Contains(l)))
                    {
                        Console.Error.WriteLine($"error: {repo.FullName}#{issue.Number}: label '{label.Name}' doesn't exist");
                    }

                    if (issue.Milestone is not null && !milestones.Contains(issue.Milestone))
                    {
                        Console.Error.WriteLine($"error: {repo.FullName}#{issue.Number}: milestone '{issue.Milestone.Title}' doesn't exist");
                    }
                }
            }

            Console.WriteLine("Creating trie...");

            var trie = new CrawledTrie<CrawledIssue>();

            foreach (var repo in repos)
            {
                foreach (var issue in repo.Issues.Values)
                {
                    trie.Add(issue);
                }
            }

            Console.WriteLine("Creating index...");

            var index = new CrawledIndex()
            {
                Repos = repos.ToList(),
                Trie  = trie
            };

            var indexName = "index.cicache";
            var indexPath = string.IsNullOrEmpty(outputPath)
                                ? Path.Join(tempDirectory, indexName)
                                : outputPath;

            await index.SaveAsync(indexPath);

            if (uploadToAzure)
            {
                Console.WriteLine("Uploading index to Azure...");

                var indexClient = new BlobClient(connectionString, "index", indexName);
                await indexClient.UploadAsync(indexPath, overwrite: true);
            }

            Console.WriteLine("Deleting temp files...");

            Directory.Delete(tempDirectory, recursive: true);
        }