private static void CreateDatabase(Args arguments)
{
    // a couple of sanity checks first
    Contract.Requires(arguments.OutputDirectory != null, "You must specify an output directory");
    Contract.Requires(arguments.InputDirectory != null, "You must specify an input directory");
    Contract.Requires(Directory.Exists(arguments.OutputDirectory), "The output directory must exist");
    Contract.Requires(Directory.Exists(arguments.InputDirectory), "The input directory must exist");

    // create the pipeline. The first step is to parse the input files, which we can do in parallel
    var buildArtifactParsingBlock = new TransformBlock<ParseBuildArtifactsInput, TimedActionResult<ParseBuildArtifactsOutput>>(
        i =>
        {
            var action = new ParseBuildArtifacts();
            return action.PerformAction(i);
        },
        new ExecutionDataflowBlockOptions()
        {
            MaxDegreeOfParallelism = arguments.AppConfig.ConcurrencyConfig.MaxBuildParsingTasks
        }
    );

    // the second step is to save the artifacts to a central folder
    var storeArtifactBlock = new ActionBlock<TimedActionResult<ParseBuildArtifactsOutput>>(
        i =>
        {
            // the exception will be logged even if we don't do it here
            if (i.ExecutionStatus)
            {
                var action = new StoreBuildArtifacts(arguments.OutputDirectory);
                action.PerformAction(i.Result);
            }
        },
        new ExecutionDataflowBlockOptions()
        {
            MaxDegreeOfParallelism = arguments.AppConfig.ConcurrencyConfig.MaxArtifactStoreTasks
        }
    );

    // link the blocks, propagating completion downstream
    var numParsingTasks = 0;
    buildArtifactParsingBlock.LinkTo(storeArtifactBlock, new DataflowLinkOptions { PropagateCompletion = true });

    // so now we can post to the initial queue
    foreach (var file in Directory.EnumerateFiles(arguments.InputDirectory, "*.json"))
    {
        buildArtifactParsingBlock.Post(new ParseBuildArtifactsInput(file));
        ++numParsingTasks;
    }
    s_logger.Info($"Posted {numParsingTasks} parsing tasks, processing");

    // now wait for the pipeline to drain
    buildArtifactParsingBlock.Complete();
    storeArtifactBlock.Completion.Wait();
    // done
}
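// illustrative sketch (hypothetical, not part of this tool): the parse-then-store
// pipeline above, boiled down to its minimal TPL Dataflow shape: a TransformBlock
// feeding an ActionBlock with completion propagated across the link. The method
// name and the string/int payloads are made up; only the Dataflow calls
// (LinkTo, Post, Complete, Completion.Wait) mirror what CreateDatabase does
private static void MinimalPipelineSketch()
{
    var transform = new TransformBlock<string, int>(s => s.Length);
    var consume = new ActionBlock<int>(n => s_logger.Info($"Processed item of length {n}"));

    // PropagateCompletion lets transform.Complete() flow through to consume
    transform.LinkTo(consume, new DataflowLinkOptions { PropagateCompletion = true });

    foreach (var item in new[] { "a", "bb", "ccc" })
    {
        transform.Post(item);
    }

    transform.Complete();        // signal that no more input will arrive
    consume.Completion.Wait();   // block until every posted item has been handled
}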
private static void ClassifyInstances(Args arguments)
{
    // a couple of sanity checks first
    Contract.Requires(arguments.OutputDirectory != null, "You must specify an output directory");
    Contract.Requires(arguments.InputDirectory != null, "You must specify an input directory");
    Contract.Requires(Directory.Exists(arguments.OutputDirectory), "The output directory must exist");
    Contract.Requires(Directory.Exists(arguments.InputDirectory), "The input directory must exist");

    var sharedCount = 0;
    var nonSharedCount = 0;
    try
    {
        // load the classifier first
        s_logger.Info("Loading classifier...");

        // work on the content store
        s_logger.Info($"Initializing store at [{arguments.OutputDirectory}]");
        var store = new RocksDbContentPlacementPredictionStore(arguments.OutputDirectory, true);
        var opContext = new OperationContext(new Context(new LogWrapper(s_logger)));

        // initialize it
        var initialized = store.StartupAsync(opContext);
        initialized.Wait();

        // and check the result; there is no point in continuing with a store we could not start up
        if (!initialized.Result)
        {
            s_logger.Error($"Could not initialize RocksDbContentPlacementPredictionStore at [{arguments.OutputDirectory}]");
            return;
        }

        var classifier = new ContentPlacementClassifier(arguments.AppConfig.ClassifierConfiguration);

        // create the pipeline. The first step is to parse the input files, which we can do in parallel
        var buildArtifactParsingBlock = new TransformManyBlock<ParseBuildArtifactsInput, KeyValuePair<string, IReadOnlyList<ArtifactWithBuildMeta>>>(
            i =>
            {
                var action = new ParseBuildArtifacts();
                var result = action.PerformAction(i);
                if (result.ExecutionStatus)
                {
                    return result.Result.ArtifactsByHash.ToList();
                }
                else
                {
                    s_logger.Error(result.Exception, $"Error when parsing [{i.BuildArtifactsFile}]");
                    throw result.Exception;
                }
            },
            new ExecutionDataflowBlockOptions()
            {
                MaxDegreeOfParallelism = arguments.AppConfig.ConcurrencyConfig.MaxBuildParsingTasks
            }
        );

        // then, once we have an artifact, we linearize it
        var linearizeBlock = new TransformBlock<KeyValuePair<string, IReadOnlyList<ArtifactWithBuildMeta>>, TimedActionResult<LinearizeArtifactsOutput>>(
            i =>
            {
                var action = new LinearizeArtifacts();
                return action.PerformAction(new LinearizeArtifactsInput(i.Key, i.Value));
            },
            new ExecutionDataflowBlockOptions()
            {
                MaxDegreeOfParallelism = arguments.AppConfig.ConcurrencyConfig.MaxArtifactLinearizationTasks
            }
        );

        // and finally we classify the linearized instances
        var classifyBlock = new ActionBlock<TimedActionResult<LinearizeArtifactsOutput>>(
            i =>
            {
                // at this point we have an ML instance
                if (i.ExecutionStatus)
                {
                    var cpInstance = new ContentPlacementInstance()
                    {
                        Artifact = i.Result.Linear.AsInstance(),   // using the default utility method
                        QueueName = i.Result.Linear.Queues.First() // taking the first is enough, since there is always exactly one
                    };
                    var result = classifier.Classify(cpInstance);
                    if (result.Succeeded)
                    {
                        var selectedMachines = result.Value;
                        foreach (var path in i.Result.Linear.ReportedPaths)
                        {
                            store.StoreResult(opContext, path, selectedMachines);
                            Interlocked.Increment(ref sharedCount);
                        }
                    }
                    else
                    {
                        Interlocked.Increment(ref nonSharedCount);
                    }
                }
            },
            new ExecutionDataflowBlockOptions()
            {
                MaxDegreeOfParallelism = arguments.AppConfig.ConcurrencyConfig.MaxArtifactClassificationTasks
            }
        );

        // link the blocks, propagating completion downstream
        var numParsingTasks = 0;
        buildArtifactParsingBlock.LinkTo(linearizeBlock, new DataflowLinkOptions { PropagateCompletion = true });
        linearizeBlock.LinkTo(classifyBlock, new DataflowLinkOptions { PropagateCompletion = true });

        // so now we can post to the initial queue
        foreach (var file in Directory.EnumerateFiles(arguments.InputDirectory, "*.json"))
        {
            buildArtifactParsingBlock.Post(new ParseBuildArtifactsInput(file));
            ++numParsingTasks;
        }
        s_logger.Info($"Posted {numParsingTasks} parsing tasks, processing");

        // now wait for the pipeline to drain
        buildArtifactParsingBlock.Complete();
        classifyBlock.Completion.Wait();

        // and now we should snapshot the store
        var snapshotDir = Path.Combine(arguments.OutputDirectory, "Snap");
        Directory.CreateDirectory(snapshotDir);
        s_logger.Info($"Done, snapshotting to [{snapshotDir}]");
        var snapshotResult = store.CreateSnapshot(opContext, snapshotDir);
        // done
    }
    finally
    {
        var total = 1.0 * (sharedCount + nonSharedCount);
        // guard against dividing by zero when no instances were processed
        var percentage = total > 0 ? sharedCount / total : 0.0;
        s_logger.Info($"Stats: shared={sharedCount} ({percentage}), nonShared={nonSharedCount}, total={total}");
    }
}
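// hypothetical usage (an assumption, not part of the original source): both modes
// would be dispatched from the tool's entry point on a parsed command line, roughly
// like this; ParseArgs and the CreateDatabaseOnly flag are made-up names
//
//   private static void Main(string[] args)
//   {
//       var arguments = ParseArgs(args);      // hypothetical helper
//       if (arguments.CreateDatabaseOnly)     // hypothetical flag
//       {
//           CreateDatabase(arguments);
//       }
//       else
//       {
//           ClassifyInstances(arguments);
//       }
//   }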