/// <summary>
/// Stages the files required to create the ardb and runs the merger to produce the ardb.
/// The merger output is then saved to storage twice: once under a date-stamped name and
/// once as latest.txt.
/// </summary>
/// <param name="packagesToInclude">The list of packages to include in the ardb.</param>
/// <param name="indexerVersion">The version of the indexer used to create the idx file.</param>
/// <param name="mergerVersion">The version of the merger to create the ardb file.</param>
/// <param name="outputDirectory">The working directory. It is created if needed and is always
/// deleted (recursively) before this method returns, whether or not the merge succeeded.</param>
void CreateArdbFile(IList<Tuple<RegistrationIndexPackage, long>> packagesToInclude, Version indexerVersion, Version mergerVersion, string outputDirectory)
{
    using (ActivityTimer timer = new ActivityTimer("CreateArdbFile"))
    {
        try
        {
            // Set up the directory structure.
            string idxDirectory = Path.Combine(outputDirectory, "idx");
            string logsDirectory = Path.Combine(outputDirectory, "logs");
            Directory.CreateDirectory(outputDirectory);
            Directory.CreateDirectory(idxDirectory);
            Directory.CreateDirectory(logsDirectory);

            // Stage the files and run the merger.
            // NOTE(review): the value returned by StageIdxFiles has never been consumed here. If
            // StageIdxFiles is implemented as a lazy iterator, nothing is staged until it is
            // enumerated - confirm against its implementation.
            StageIdxFiles(packagesToInclude, this._storage, indexerVersion, idxDirectory);
            string ardbTextFile = RunArdbMerger(mergerVersion, idxDirectory, outputDirectory);

            // Save the ardb/txt file.
            // The date stamp becomes part of a storage path, so it must not depend on the calendar
            // of the machine's current culture; format it with the invariant culture.
            string version = DateTime.UtcNow.ToString("yyyyMMdd", System.Globalization.CultureInfo.InvariantCulture);
            Uri ardbResourceUri = this._storage.ComposeArdbResourceUrl(mergerVersion, $"{version}\\{version}.ardb.txt");
            this._storage.SaveFileContents(ardbTextFile, ardbResourceUri);
            SarifTraceListener.TraceInformation($"Saved ardb/txt file to {ardbResourceUri}.");

            // Save the ardb/txt file to latest.txt. This is the file consumed by the publisher.
            Uri latestResourceUri = this._storage.ComposeArdbResourceUrl(mergerVersion, "latest\\latest.txt");
            this._storage.SaveFileContents(ardbTextFile, latestResourceUri);
            SarifTraceListener.TraceInformation($"Saved ardb/txt file to {latestResourceUri}.");
        }
        finally
        {
            // Best-effort clean up of the working directory. A failure to delete is only logged
            // so that it never masks an exception thrown by the work above.
            try
            {
                Directory.Delete(outputDirectory, true);
            }
            catch (Exception e)
            {
                SarifTraceListener.TraceWarning("NG009", $"Could not delete the temp directory {outputDirectory}.", e);
            }
        }
    }
}
/// <summary>
/// Filters the package list to only the latest stable version of the most downloaded packages.
/// </summary>
/// <param name="packageDownloadCounts">Maps a package id to the download count for that package.</param>
/// <param name="downloadCountThreshold">The number of downloads the included packages should add up to.
/// Packages are included, most downloaded first, until this threshold is reached. The threshold is
/// lowered for each package that has no latest stable version.</param>
/// <returns>The latest stable version of each included package paired with the package's download count,
/// in descending download count order.</returns>
/// <exception cref="InvalidOperationException">Fewer packages were included than
/// Catalog2ElfieOptions.MinimumPackageCountAfterFiltering.</exception>
IList<Tuple<RegistrationIndexPackage, long>> FilterPackagesToInclude(Dictionary<string, long> packageDownloadCounts, long downloadCountThreshold)
{
    using (ActivityTimer timer = new ActivityTimer("FilterPackagesToInclude"))
    {
        // In general, here's how the filtering works.
        // 1. Calculate the total download count for all the NuGet packages. (this is done before calling this method.)
        // 2. Calculate how many downloads we want to include in the ardb. (this is done before calling this method.)
        // 3. Exclude any packages which do not have a latest stable version.
        // 4. Sort the packages based on download count.
        // 5. Include the most popular packages until the target download count is reached.
        // 6. Sort the included packages, first by log2(downloadcount), then by package name.
        //    This sort order ensures the most popular packages are recommended first, while
        //    minimizing the day-to-day differences. (this is not performed in this method.)

        // The running count of the downloads included in the ardb.
        long includedDownloadCount = 0;

        // A flag to signal that the download threshold was reached.
        bool thresholdReached = false;

        List<Tuple<RegistrationIndexPackage, long>> includedPackagesWithCounts = new List<Tuple<RegistrationIndexPackage, long>>();

        // We need to determine the latest stable version for each package. But this is a fairly expensive operation since
        // it makes a network call. We'll process the packages in chunks so we can get the latest stable version of
        // the packages using multiple threads.
        int batchSize = Catalog2ElfieOptions.FilterPackagesToIncludeBatchSize;
        int currentPosition = 0;

        // We need to process the packages in order from most popular to least popular.
        // (item.Value is the download count for the package)
        // The ordering is materialized once. Left as a deferred query, every enumeration of a batch
        // would sort the complete dictionary again.
        List<KeyValuePair<string, long>> orderedDownloadCounts = packageDownloadCounts.OrderByDescending(item => item.Value).ToList();

        List<KeyValuePair<string, long>> batch;

        do
        {
            // Get the next chunk of packages to process. The chunk is materialized because it is
            // walked more than once below.
            batch = orderedDownloadCounts.Skip(currentPosition).Take(batchSize).ToList();
            IEnumerable<string> batchPackageIds = batch.Select(item => item.Key).ToList();

            // Get the latest versions of the packages.
            Dictionary<string, RegistrationIndexPackage> latestVersions = GetLatestVersion(batchPackageIds);

            foreach (KeyValuePair<string, long> entry in batch)
            {
                string packageId = entry.Key;
                long downloadCount = entry.Value;

                RegistrationIndexPackage latestStableVersion;

                // If there's a latest stable version for the package, we want to include it.
                if (latestVersions.TryGetValue(packageId, out latestStableVersion))
                {
                    Trace.TraceInformation($"Included package: {packageId} - {downloadCount.ToString("#,####")}");

                    includedDownloadCount += downloadCount;
                    includedPackagesWithCounts.Add(Tuple.Create(latestStableVersion, downloadCount));
                }
                else
                {
                    // There wasn't a latest stable version of this package.
                    // Reduce the threshold by this package's download count since it shouldn't be counted.
                    downloadCountThreshold -= (long)(downloadCount * this._downloadPercentage);
                }

                // Stop if we've reached the download threshold.
                Trace.TraceInformation($"Download count {includedDownloadCount.ToString("#,####")} / {downloadCountThreshold.ToString("#,####")}");
                thresholdReached = (includedDownloadCount >= downloadCountThreshold);
                if (thresholdReached)
                {
                    break;
                }
            }

            Trace.TraceInformation($"Current package count {includedPackagesWithCounts.Count.ToString("#,###")}.");
            currentPosition += batchSize;

            // An empty batch means every package has been examined.
        } while (!thresholdReached && batch.Count > 0);

        SarifTraceListener.TraceInformation($"Including {includedPackagesWithCounts.Count.ToString("#,###")} packages.");
        SarifTraceListener.TraceInformation("NG911", includedPackagesWithCounts.Count.ToString());

        // Basic validation, just check that the package counts are about the right number.
        int minimumPackageCount = Catalog2ElfieOptions.MinimumPackageCountAfterFiltering;
        Trace.TraceInformation($"Verify filtered package count {includedPackagesWithCounts.Count} > {minimumPackageCount}.");
        if (includedPackagesWithCounts.Count < minimumPackageCount)
        {
            throw new InvalidOperationException($"The filtered package count is less than the minimum set of filtered packages. {includedPackagesWithCounts.Count} < {minimumPackageCount}");
        }

        return includedPackagesWithCounts;
    }
}
/// <summary>
/// Retrieves and parses the json document which holds the download count of each package.
/// </summary>
/// <param name="downloadJsonUri">The location of the download counts document.</param>
/// <returns>The parsed document as a JArray.</returns>
/// <remarks>The document is a bare json array rather than a json object, i.e. it is not wrapped in { }.</remarks>
/// <exception cref="InvalidOperationException">The document has fewer entries than
/// Catalog2ElfieOptions.MinimumPackageCountFromDownloadUrl.</exception>
JArray FetchDownloadCounts(Uri downloadJsonUri)
{
    using (ActivityTimer activity = new ActivityTimer("FetchDownloadCounts"))
    {
        // Pull down the raw text first, then parse it.
        string rawJson;
        using (WebClient client = new WebClient())
        {
            rawJson = client.DownloadString(downloadJsonUri);
        }

        JArray counts = JArray.Parse(rawJson);
        int entryCount = counts.Count;

        SarifTraceListener.TraceInformation($"Total packages in download json: {entryCount.ToString("#,###")}");
        SarifTraceListener.TraceInformation("NG910", entryCount.ToString());

        // Sanity check: the document should hold roughly the expected number of packages.
        int requiredCount = Catalog2ElfieOptions.MinimumPackageCountFromDownloadUrl;
        Trace.TraceInformation($"Verify download json file package count {entryCount} > {requiredCount}.");

        if (entryCount >= requiredCount)
        {
            return counts;
        }

        throw new InvalidOperationException($"The download count json file which was downloaded did not contain the minimum set of download data. {entryCount} < {requiredCount}");
    }
}
/// <summary>
/// Builds the Elfie index files, Idx and Ardb, for NuGet packages.
/// </summary>
/// <param name="cancellationToken">Token handed to the idx creation step.</param>
/// <returns>True when the indexes were created. False when they were not created because of
/// an error considered transient (a WebException whose response is a bad gateway). Any other
/// error surfaces as an exception.</returns>
public async Task<bool> Run(CancellationToken cancellationToken)
{
    using (ActivityTimer activity = new ActivityTimer("Run"))
    {
        try
        {
            // Start with the download counts.
            JArray downloadCounts = FetchDownloadCounts(this._downloadCountsUri);

            // The package list comes from two places: NuGet, and the text files found in the
            // AssemblyPackages subdirectory.
            //
            // NuGet
            //   Only the latest stable versions are kept.
            //   Only the top XX% of downloads are kept.
            //   Packages are grouped by log2(downloadcount). Note: This is done in the elfie merger.
            //
            // Local Packages
            //   Each text file in the AssemblyPackages folder yields a fake package. The fake
            //   packages are how the .NET Framework assemblies get into the index.
            //   The fake packages go in the highest grouping.

            // NuGet packages for the ardb index.
            IList<Tuple<RegistrationIndexPackage, long>> packagesToInclude = GetPackagesToInclude(downloadCounts, this._downloadPercentage);
            SarifTraceListener.TraceInformation($"Including {packagesToInclude.Count} potential NuGet packages.");

            // Local (framework) assembly packages for the ardb index.
            IList<Tuple<RegistrationIndexPackage, long>> frameworkPackages = GetLocalAssemblyPackages(Catalog2ElfieOptions.AssemblyPackagesDirectory);
            SarifTraceListener.TraceInformation($"Including {frameworkPackages.Count} potential local packages.");

            // Append the local packages to the NuGet ones.
            foreach (Tuple<RegistrationIndexPackage, long> frameworkPackage in frameworkPackages)
            {
                packagesToInclude.Add(frameworkPackage);
            }

            SarifTraceListener.TraceInformation($"Including {packagesToInclude.Count} total potential packages.");

            // One idx index per package.
            IEnumerable<RegistrationIndexPackage> packagesToIndex = packagesToInclude.Select(item => item.Item1);
            await CreateIdxIndexesAsync(packagesToIndex, cancellationToken);

            // Then the ardb index, built in a uniquely named working directory.
            string workingDirectory = Path.Combine(this._tempPath, Guid.NewGuid().ToString());
            CreateArdbFile(packagesToInclude, this._indexerVersion, this._mergerVersion, workingDirectory);

            return true;
        }
        catch (System.Net.WebException webException)
        {
            System.Net.HttpWebResponse httpResponse = webException.Response as System.Net.HttpWebResponse;
            bool isBadGateway = httpResponse != null && httpResponse.StatusCode == System.Net.HttpStatusCode.BadGateway;

            if (!isBadGateway)
            {
                // Anything other than a bad gateway is rethrown. This stops the application so
                // the issue can be addressed.
                throw;
            }

            // A bad gateway is likely transient. Report failure so Catalog2Elfie sleeps and
            // tries again after the interval elapses.
            return false;
        }
    }
}
/// <summary>
/// Creates an idx file for each package.
/// </summary>
/// <param name="packages">The list of packages to process.</param>
/// <param name="cancellationToken">Token passed to the storage lookup and to the per-package processing.</param>
/// <remarks>If the package's idx file is already in storage, e.g. it was created in
/// a previous run, we use the stored package. A new idx file is only created for new packages.
/// Although the method is declared async it contains no await: the packages are processed with
/// a blocking Parallel.ForEach, so all of the work has finished by the time the task is returned.</remarks>
async Task CreateIdxIndexesAsync(IEnumerable<RegistrationIndexPackage> packages, CancellationToken cancellationToken)
{
    // NOTE(review): the timer is named "ProcessCatalogItems" while the other timers in this file
    // use the name of their method. Left unchanged in case the name is consumed downstream - confirm.
    using (ActivityTimer timer = new ActivityTimer("ProcessCatalogItems"))
    {
        ParallelOptions options = new ParallelOptions()
        {
            MaxDegreeOfParallelism = this._maxThreads,
        };

        Parallel.ForEach(packages, options, package =>
        {
            Trace.TraceInformation("Processing package {0}", package.CatalogEntry.PackageId);

            // Get the storage url for the idx file. We'll use this to check if the
            // idx file already exists before going through the effort of creating one.
            Uri idxResourceUri = this._storage.ComposeIdxResourceUrl(this._indexerVersion, package.CatalogEntry.PackageId, package.CatalogEntry.PackageVersion);

            // Flow the caller's token so that the storage lookup can be cancelled too. Passing a
            // fresh CancellationToken here meant the lookup could never observe a cancellation.
            StorageContent idxStorageItem = this._storage.Load(idxResourceUri, cancellationToken).Result;

            if (idxStorageItem == null)
            {
                // We didn't have the idx file in storage, so go through the process of downloading,
                // decompressing and creating the idx file.
                ProcessPackageDetailsAsync(package, cancellationToken).Wait();
            }
            else
            {
                SarifTraceListener.TraceInformation($"Idx already exists in storage for package {package.CatalogEntry.PackageId}.");
            }
        });
    }
}