/// <summary>
/// Single consumer that drains <paramref name="queue"/>: echoes each package's crawler
/// log lines to the console and records one (packageId, apiId) usage row per API that is
/// present in <paramref name="apiMap"/>. Saves the writer once the queue completes.
/// </summary>
/// <param name="database">Usage database the usage writer is created from.</param>
/// <param name="apiMap">Known APIs; results for APIs not in this map are skipped.</param>
/// <param name="packageMap">Maps package identities to their database ids.</param>
/// <param name="queue">Producer/consumer queue filled by the crawl workers; consumption
/// ends when the producers call <c>CompleteAdding()</c>.</param>
static async Task OutputWorker(UsageDatabase database, IdMap<Guid> apiMap, IdMap<PackageIdentity> packageMap, BlockingCollection<PackageResults> queue)
{
    try
    {
        using var usageWriter = database.CreateUsageWriter();

        // Blocks until items are available and finishes when CompleteAdding() is called.
        foreach (var (packageIdentity, logLines, apis) in queue.GetConsumingEnumerable())
        {
            foreach (var line in logLines)
            {
                Console.WriteLine($"[Crawler] {line}");
            }

            // Hoisted out of the API loop: the package id is invariant per package, so
            // there is no reason to resolve it once per API.
            // NOTE(review): assumes IdMap.GetId is a pure lookup with no side effects —
            // verify against the IdMap implementation.
            var packageId = packageMap.GetId(packageIdentity);

            foreach (var api in apis)
            {
                // Skip APIs that aren't part of the known API set.
                if (!apiMap.Contains(api))
                {
                    continue;
                }

                var apiId = apiMap.GetId(api);
                await usageWriter.WriteAsync(packageId, apiId);
            }
        }

        await usageWriter.SaveAsync();
        Console.WriteLine("Output Worker has finished.");
    }
    catch (Exception ex)
    {
        // If the output worker dies, crawl results are silently lost; fail the whole
        // process so the run is visibly broken instead of uploading partial data.
        Console.WriteLine("[Fatal] Output Worker crashed: " + ex);
        Environment.Exit(1);
    }
}
// Full crawl pipeline: downloads the API catalog and the previously built usage database,
// diffs the currently published package list against the already-indexed set, deletes
// removed packages, crawls new ones in parallel, aggregates and uploads the results.
// NOTE(review): this method's closing brace is outside the visible chunk — the definition
// continues past the last statement shown here.
// NOTE(review): two interpolated string literals below contain a literal line break
// ("Finished package discovery. / Took ..." and "Finished inserting new packages. /
// Took ...") — this looks like a paste/formatting artifact and will not compile as-is;
// presumably each was a single-line string. Left byte-identical here.
private static async Task CrawlAsync(PackageListCrawler packageListCrawler, CrawlerStore crawlerStore)
{
    // Scratch files: the API catalog, the working SQLite-style usage database, and the
    // exported TSV with aggregated usages.
    var apiCatalogPath = GetScratchFilePath("apicatalog.dat");
    var databasePath = GetScratchFilePath("usage.db");
    var usagesPath = GetScratchFilePath("usages.tsv");

    Console.WriteLine("Downloading API catalog...");
    await crawlerStore.DownloadApiCatalogAsync(apiCatalogPath);

    Console.WriteLine("Loading API catalog...");
    var apiCatalog = await ApiCatalogModel.LoadAsync(apiCatalogPath);

    // Start from the database produced by the previous run so unchanged packages
    // don't have to be re-crawled.
    Console.WriteLine("Downloading previously indexed usages...");
    await crawlerStore.DownloadDatabaseAsync(databasePath);
    using var usageDatabase = await UsageDatabase.OpenOrCreateAsync(databasePath);

    Console.WriteLine("Creating temporary indexes...");
    await usageDatabase.CreateTempIndexesAsync();

    // In-memory id maps for the APIs and packages already present in the database.
    Console.WriteLine("Discovering existing APIs...");
    var apiMap = await usageDatabase.ReadApisAsync();

    Console.WriteLine("Discovering existing packages...");
    var packageIdMap = await usageDatabase.ReadPackagesAsync();

    Console.WriteLine("Discovering latest packages...");
    var stopwatch = Stopwatch.StartNew();
    var packages = await packageListCrawler.GetPackagesAsync();
    // NOTE(review): broken string literal — line break mid-string, see header note.
    Console.WriteLine($"Finished package discovery. 
Took {stopwatch.Elapsed}");
    Console.WriteLine($"Found {packages.Count:N0} package(s) in total.");

    // Keep only the latest stable and latest preview version of each package id.
    packages = CollapseToLatestStableAndLatestPreview(packages);
    Console.WriteLine($"Found {packages.Count:N0} package(s) after collapsing to latest stable & latest preview.");

    // Diff indexed vs. current: packages no longer published get deleted from the
    // index; packages not yet indexed get crawled.
    var indexedPackages = new HashSet <PackageIdentity>(packageIdMap.Values);
    var currentPackages = new HashSet <PackageIdentity>(packages);
    var packagesToBeDeleted = indexedPackages.Where(p => !currentPackages.Contains(p)).ToArray();
    var packagesToBeIndexed = currentPackages.Where(p => !indexedPackages.Contains(p)).ToArray();
    Console.WriteLine($"Found {indexedPackages.Count:N0} package(s) in the index.");
    Console.WriteLine($"Found {packagesToBeDeleted.Length:N0} package(s) to remove from the index.");
    Console.WriteLine($"Found {packagesToBeIndexed.Length:N0} package(s) to add to the index.");

    Console.WriteLine($"Deleting packages...");
    stopwatch.Restart();
    await usageDatabase.DeletePackagesAsync(packagesToBeDeleted.Select(p => packageIdMap.GetId(p)));
    Console.WriteLine($"Finished deleting packages. Took {stopwatch.Elapsed}");

    // Register the new packages: assign ids in the in-memory map and persist them.
    Console.WriteLine($"Inserting new packages...");
    stopwatch.Restart();
    using (var packageWriter = usageDatabase.CreatePackageWriter())
    {
        foreach (var packageIdentity in packagesToBeIndexed)
        {
            var packageId = packageIdMap.Add(packageIdentity);
            await packageWriter.WriteAsync(packageId, packageIdentity);
        }
        await packageWriter.SaveAsync();
    }
    // NOTE(review): broken string literal — line break mid-string, see header note.
    Console.WriteLine($"Finished inserting new packages. 
Took {stopwatch.Elapsed}");

    // Fan out: one crawl worker per processor pulls from inputQueue and pushes
    // PackageResults to outputQueue; a single OutputWorker consumes outputQueue and
    // writes usages to the database.
    stopwatch.Restart();
    var numberOfWorkers = Environment.ProcessorCount;
    Console.WriteLine($"Crawling using {numberOfWorkers} workers.");
    var inputQueue = new ConcurrentQueue <PackageIdentity>(packagesToBeIndexed);
    var outputQueue = new BlockingCollection <PackageResults>();
    var workers = Enumerable.Range(0, numberOfWorkers)
        .Select(i => Task.Run(() => CrawlWorker(i, inputQueue, outputQueue)))
        .ToArray();
    var outputWorker = Task.Run(() => OutputWorker(usageDatabase, apiMap, packageIdMap, outputQueue));
    await Task.WhenAll(workers);
    // All producers are done — unblock the consumer's GetConsumingEnumerable(), then
    // wait for it to flush.
    outputQueue.CompleteAdding();
    await outputWorker;
    Console.WriteLine($"Finished crawling. Took {stopwatch.Elapsed}");

    // Persist any APIs seen during the crawl that weren't in the database yet.
    Console.WriteLine("Inserting missing APIs...");
    stopwatch.Restart();
    await usageDatabase.InsertMissingApisAsync(apiMap);
    Console.WriteLine($"Finished inserting missing APIs. Took {stopwatch.Elapsed}");

    // Roll usages up the API hierarchy (each API counts for itself and its ancestors)
    // and export the aggregate to TSV.
    Console.WriteLine($"Aggregating results...");
    stopwatch.Restart();
    var ancestors = apiCatalog.GetAllApis()
        .SelectMany(a => a.AncestorsAndSelf(), (api, ancestor) => (api.Guid, ancestor.Guid));
    await usageDatabase.ExportUsagesAsync(apiMap, ancestors, usagesPath);
    Console.WriteLine($"Finished aggregating results. Took {stopwatch.Elapsed}");

    Console.WriteLine($"Vacuuming database...");
    stopwatch.Restart();
    await usageDatabase.VacuumAsync();
    Console.WriteLine($"Finished vacuuming database. Took {stopwatch.Elapsed}");

    // Dispose explicitly before uploading so the database file is fully flushed/closed.
    usageDatabase.Dispose();

    Console.WriteLine($"Uploading usages...");
    await crawlerStore.UploadResultsAsync(usagesPath);

    Console.WriteLine($"Uploading database...");
    await crawlerStore.UploadDatabaseAsync(databasePath);