Example #1
0
        /// <summary>
        /// Drains crawl results from <paramref name="queue"/> and persists every
        /// usage of an already-known API into <paramref name="database"/>.
        /// Crawler log lines are echoed to the console as they arrive.
        /// Any failure is treated as fatal for the whole process (fail fast so a
        /// partial index is never silently uploaded).
        /// </summary>
        static async Task OutputWorker(UsageDatabase database,
                                       IdMap<Guid> apiMap,
                                       IdMap<PackageIdentity> packageMap,
                                       BlockingCollection<PackageResults> queue)
        {
            try
            {
                using var usageWriter = database.CreateUsageWriter();

                // Blocks until producers call CompleteAdding() and the queue drains.
                foreach (var (packageIdentity, logLines, apis) in queue.GetConsumingEnumerable())
                {
                    foreach (var logLine in logLines)
                    {
                        Console.WriteLine($"[Crawler] {logLine}");
                    }

                    foreach (var api in apis)
                    {
                        // Only record APIs that already exist in the map; unknown
                        // ones are handled later by InsertMissingApisAsync.
                        if (apiMap.Contains(api))
                        {
                            var packageId = packageMap.GetId(packageIdentity);
                            var apiId     = apiMap.GetId(api);
                            await usageWriter.WriteAsync(packageId, apiId);
                        }
                    }
                }

                await usageWriter.SaveAsync();

                Console.WriteLine("Output Worker has finished.");
            }
            catch (Exception ex)
            {
                Console.WriteLine("[Fatal] Output Worker crashed: " + ex);
                Environment.Exit(1);
            }
        }
Example #2
0
    /// <summary>
    /// End-to-end incremental crawl: downloads the API catalog and the existing
    /// usage database, diffs the previously indexed packages against the current
    /// package list, crawls only the new packages with one worker per core,
    /// aggregates usages against the catalog's API ancestry, and uploads the
    /// resulting TSV and the refreshed database back to <paramref name="crawlerStore"/>.
    /// </summary>
    private static async Task CrawlAsync(PackageListCrawler packageListCrawler, CrawlerStore crawlerStore)
    {
        var apiCatalogPath = GetScratchFilePath("apicatalog.dat");
        var databasePath   = GetScratchFilePath("usage.db");
        var usagesPath     = GetScratchFilePath("usages.tsv");

        Console.WriteLine("Downloading API catalog...");

        await crawlerStore.DownloadApiCatalogAsync(apiCatalogPath);

        Console.WriteLine("Loading API catalog...");

        var apiCatalog = await ApiCatalogModel.LoadAsync(apiCatalogPath);

        Console.WriteLine("Downloading previously indexed usages...");

        await crawlerStore.DownloadDatabaseAsync(databasePath);

        // Opens the downloaded database, or starts a fresh one on a first run.
        // Explicitly disposed before upload (see below), so the `using` here is
        // belt-and-braces for the failure paths.
        using var usageDatabase = await UsageDatabase.OpenOrCreateAsync(databasePath);

        Console.WriteLine("Creating temporary indexes...");

        await usageDatabase.CreateTempIndexesAsync();

        Console.WriteLine("Discovering existing APIs...");

        var apiMap = await usageDatabase.ReadApisAsync();

        Console.WriteLine("Discovering existing packages...");

        var packageIdMap = await usageDatabase.ReadPackagesAsync();

        Console.WriteLine("Discovering latest packages...");

        var stopwatch = Stopwatch.StartNew();
        var packages  = await packageListCrawler.GetPackagesAsync();

        Console.WriteLine($"Finished package discovery. Took {stopwatch.Elapsed}");
        Console.WriteLine($"Found {packages.Count:N0} package(s) in total.");

        packages = CollapseToLatestStableAndLatestPreview(packages);

        Console.WriteLine($"Found {packages.Count:N0} package(s) after collapsing to latest stable & latest preview.");

        // Diff the already-indexed set against the current package list:
        // stale packages are deleted, unseen packages are crawled.
        var indexedPackages = new HashSet <PackageIdentity>(packageIdMap.Values);
        var currentPackages = new HashSet <PackageIdentity>(packages);

        var packagesToBeDeleted = indexedPackages.Where(p => !currentPackages.Contains(p)).ToArray();
        var packagesToBeIndexed = currentPackages.Where(p => !indexedPackages.Contains(p)).ToArray();

        Console.WriteLine($"Found {indexedPackages.Count:N0} package(s) in the index.");
        Console.WriteLine($"Found {packagesToBeDeleted.Length:N0} package(s) to remove from the index.");
        Console.WriteLine($"Found {packagesToBeIndexed.Length:N0} package(s) to add to the index.");

        Console.WriteLine($"Deleting packages...");

        stopwatch.Restart();
        await usageDatabase.DeletePackagesAsync(packagesToBeDeleted.Select(p => packageIdMap.GetId(p)));

        Console.WriteLine($"Finished deleting packages. Took {stopwatch.Elapsed}");

        Console.WriteLine($"Inserting new packages...");

        stopwatch.Restart();

        // Register the new packages (and assign their ids) before crawling, so
        // the output worker can resolve packageIdMap.GetId(...) for every result.
        using (var packageWriter = usageDatabase.CreatePackageWriter())
        {
            foreach (var packageIdentity in packagesToBeIndexed)
            {
                var packageId = packageIdMap.Add(packageIdentity);
                await packageWriter.WriteAsync(packageId, packageIdentity);
            }

            await packageWriter.SaveAsync();
        }

        Console.WriteLine($"Finished inserting new packages. Took {stopwatch.Elapsed}");

        stopwatch.Restart();

        var numberOfWorkers = Environment.ProcessorCount;

        Console.WriteLine($"Crawling using {numberOfWorkers} workers.");

        // Fan-out/fan-in: N crawl workers pull from inputQueue and push results
        // into outputQueue; a single OutputWorker is the only database writer
        // during the crawl, so no write synchronization is needed here.
        var inputQueue = new ConcurrentQueue <PackageIdentity>(packagesToBeIndexed);

        var outputQueue = new BlockingCollection <PackageResults>();

        var workers = Enumerable.Range(0, numberOfWorkers)
                      .Select(i => Task.Run(() => CrawlWorker(i, inputQueue, outputQueue)))
                      .ToArray();

        var outputWorker = Task.Run(() => OutputWorker(usageDatabase, apiMap, packageIdMap, outputQueue));

        await Task.WhenAll(workers);

        // All producers are done; unblock GetConsumingEnumerable so the output
        // worker can drain the remaining results and finish.
        outputQueue.CompleteAdding();
        await outputWorker;

        Console.WriteLine($"Finished crawling. Took {stopwatch.Elapsed}");

        Console.WriteLine("Inserting missing APIs...");

        stopwatch.Restart();
        // Backfills APIs the crawl discovered that weren't in apiMap yet
        // (the output worker skipped recording usages for those).
        await usageDatabase.InsertMissingApisAsync(apiMap);

        Console.WriteLine($"Finished inserting missing APIs. Took {stopwatch.Elapsed}");

        Console.WriteLine($"Aggregating results...");

        stopwatch.Restart();

        // Roll each API's usage up to all of its ancestors (namespace, type,
        // member hierarchy) so the export counts aggregate correctly.
        var ancestors = apiCatalog.GetAllApis()
                        .SelectMany(a => a.AncestorsAndSelf(), (api, ancestor) => (api.Guid, ancestor.Guid));
        await usageDatabase.ExportUsagesAsync(apiMap, ancestors, usagesPath);

        Console.WriteLine($"Finished aggregating results. Took {stopwatch.Elapsed}");

        Console.WriteLine($"Vacuuming database...");

        stopwatch.Restart();
        await usageDatabase.VacuumAsync();

        Console.WriteLine($"Finished vacuuming database. Took {stopwatch.Elapsed}");

        // Dispose before upload so the database file is flushed and unlocked.
        // NOTE(review): combined with `using var` above this makes Dispose run
        // twice on the success path — presumably UsageDatabase.Dispose is
        // idempotent; confirm.
        usageDatabase.Dispose();

        Console.WriteLine($"Uploading usages...");

        await crawlerStore.UploadResultsAsync(usagesPath);

        Console.WriteLine($"Uploading database...");

        await crawlerStore.UploadDatabaseAsync(databasePath);