/// <summary>
/// Parses every build-artifact json file in the input directory, linearizes each artifact,
/// classifies it, and stores the predicted machines in a RocksDb-backed prediction store.
/// A snapshot of the store is written to &lt;OutputDirectory&gt;/Snap when the pipeline drains.
/// </summary>
/// <param name="arguments">Parsed command-line arguments; InputDirectory and OutputDirectory must exist.</param>
private static void ClassifyInstances(Args arguments)
{
    // a couple of sanity checks on the arguments first
    Contract.Requires(arguments.OutputDirectory != null, "You must specify an output directory");
    Contract.Requires(arguments.InputDirectory != null, "You must specify an input directory");
    Contract.Requires(Directory.Exists(arguments.OutputDirectory), "The output directory must exist");
    Contract.Requires(Directory.Exists(arguments.InputDirectory), "The input directory must exist");
    var sharedCount = 0;
    var nonSharedCount = 0;
    try
    {
        // load the classifier first
        s_logger.Info("Loading classifier...");
        // work on the content store
        s_logger.Info($"Initializing store at [{arguments.OutputDirectory}]");
        var store = new RocksDbContentPlacementPredictionStore(arguments.OutputDirectory, true);
        var opContext = new OperationContext(new Context(new LogWrapper(s_logger)));
        // init it. GetAwaiter().GetResult() surfaces the original exception rather than
        // an AggregateException (unlike .Wait()/.Result).
        var initialized = store.StartupAsync(opContext).GetAwaiter().GetResult();
        if (!initialized)
        {
            // Bail out early: continuing with an uninitialized store would only make every
            // subsequent StoreResult call fail. (Previously the code logged and kept going.)
            s_logger.Error($"Could not initialize RocksDbContentPlacementPredictionStore at [{arguments.OutputDirectory}]");
            return;
        }
        var classifier = new ContentPlacementClassifier(arguments.AppConfig.ClassifierConfiguration);
        // create the pipeline. The first step here is to parse the input files, and we can do this in parallel
        var buildArtifactParsingBlock = new TransformManyBlock<ParseBuildArtifactsInput, KeyValuePair<string, IReadOnlyList<ArtifactWithBuildMeta>>>(
            i =>
            {
                var action = new ParseBuildArtifacts();
                var result = action.PerformAction(i);
                if (result.ExecutionStatus)
                {
                    return result.Result.ArtifactsByHash.ToList();
                }
                // propagate the failure so the pipeline faults instead of silently dropping the file
                s_logger.Error(result.Exception, $"Error when parsing [{i.BuildArtifactsFile}]");
                throw result.Exception;
            },
            new ExecutionDataflowBlockOptions()
            {
                MaxDegreeOfParallelism = arguments.AppConfig.ConcurrencyConfig.MaxBuildParsingTasks
            }
        );
        // then, when we have one, we linearize it
        var linearizeBlock = new TransformBlock<KeyValuePair<string, IReadOnlyList<ArtifactWithBuildMeta>>, TimedActionResult<LinearizeArtifactsOutput>>(
            i =>
            {
                var action = new LinearizeArtifacts();
                return action.PerformAction(new LinearizeArtifactsInput(i.Key, i.Value));
            },
            new ExecutionDataflowBlockOptions()
            {
                MaxDegreeOfParallelism = arguments.AppConfig.ConcurrencyConfig.MaxArtifactLinearizationTasks
            }
        );
        // and we classify them
        var classifyBlock = new ActionBlock<TimedActionResult<LinearizeArtifactsOutput>>(
            i =>
            {
                // i have an ml instance here
                if (i.ExecutionStatus)
                {
                    var cpInstance = new ContentPlacementInstance()
                    {
                        Artifact = i.Result.Linear.AsInstance(), // using the default utility method
                        QueueName = i.Result.Linear.Queues.First() // the first here is enough, since its always one!
                    };
                    var classification = classifier.Classify(cpInstance);
                    if (classification.Succeeded)
                    {
                        var selectedMachines = classification.Value;
                        foreach (var path in i.Result.Linear.ReportedPaths)
                        {
                            store.StoreResult(opContext, path, selectedMachines);
                            // Interlocked.Increment is the idiomatic form of Interlocked.Add(ref x, 1)
                            Interlocked.Increment(ref sharedCount);
                        }
                    }
                    else
                    {
                        Interlocked.Increment(ref nonSharedCount);
                    }
                }
            },
            new ExecutionDataflowBlockOptions()
            {
                MaxDegreeOfParallelism = arguments.AppConfig.ConcurrencyConfig.MaxArtifactClassificationTasks
            }
        );
        // link them, propagating completion so a single Complete() on the head drains the whole pipeline
        var numParsingTasks = 0;
        buildArtifactParsingBlock.LinkTo(linearizeBlock, new DataflowLinkOptions { PropagateCompletion = true });
        linearizeBlock.LinkTo(classifyBlock, new DataflowLinkOptions { PropagateCompletion = true });
        // now we can post to the initial queue
        foreach (var file in Directory.EnumerateFiles(arguments.InputDirectory, "*.json"))
        {
            buildArtifactParsingBlock.Post(new ParseBuildArtifactsInput(file));
            ++numParsingTasks;
        }
        s_logger.Info($"Posted {numParsingTasks} parsing tasks, processing");
        // now wait for the whole pipeline to drain
        buildArtifactParsingBlock.Complete();
        classifyBlock.Completion.Wait();
        // and now we should snapshot
        var snapshotDir = Path.Combine(arguments.OutputDirectory, "Snap");
        Directory.CreateDirectory(snapshotDir);
        s_logger.Info($"Done, snapshoting to [{snapshotDir}]");
        // surface snapshot failures instead of silently discarding the result
        var snapshotResult = store.CreateSnapshot(opContext, snapshotDir);
        if (!snapshotResult.Succeeded)
        {
            s_logger.Error($"Could not create snapshot at [{snapshotDir}]");
        }
        // NOTE(review): the store is never shut down here; consider store.ShutdownAsync(opContext) — confirm API
    }
    finally
    {
        var total = sharedCount + nonSharedCount;
        // guard against 0/0 => NaN when no artifacts were processed at all
        var percentage = total > 0 ? (1.0 * sharedCount) / total : 0.0;
        s_logger.Info($"Stats: shared={sharedCount} ({percentage}), nonShared={nonSharedCount}, total={total}");
    }
}
/// <summary>
/// Loads a classifier from &lt;InputDirectory&gt;/classifier.json and evaluates it against one
/// randomly generated instance per known queue, logging load time, approximate memory use,
/// per-instance classification latency and the machine alternatives that were predicted.
/// </summary>
/// <param name="arguments">Parsed command-line arguments; InputDirectory must exist.</param>
private static void EvaluateContentPlacementClassifier(Args arguments)
{
    Contract.Requires(arguments.InputDirectory != null, "You must specify an input directory");
    Contract.Requires(Directory.Exists(arguments.InputDirectory), "The input directory must exist");
    var configurationFile = Path.Combine(arguments.InputDirectory, "classifier.json");
    s_logger.Info($"Evaluating classifier from [{configurationFile}]");
    // approximate the memory the classifier consumes and time how long it takes to load
    var memoryBefore = GC.GetTotalMemory(true);
    var loadTimer = Stopwatch.StartNew();
    var classifier = new ContentPlacementClassifier(configurationFile);
    loadTimer.Stop();
    var consumedMemory = GC.GetTotalMemory(false) - memoryBefore;
    s_logger.Info($"Classifier loaded in {loadTimer.ElapsedMilliseconds}ms, approxBytes={consumedMemory}");
    var numInstances = 0;
    var random = new Random(Environment.TickCount);
    // gather the queue names from the queue map directory; one test instance per queue
    var queueNames = new List<string>();
    var predictions = new Dictionary<ContentPlacementInstance, List<string>>();
    var uniqueMachines = new HashSet<string>();
    foreach (var queueFile in Directory.EnumerateFiles(Path.Combine(arguments.InputDirectory, "QueueMap")))
    {
        queueNames.Add(Path.GetFileNameWithoutExtension(queueFile));
        ++numInstances;
    }
    // classify one random instance per queue, timing the whole batch
    var notShared = 0;
    var noAlternatives = 0;
    var classifyTimer = Stopwatch.StartNew();
    foreach (var queueName in queueNames)
    {
        var instance = new ContentPlacementInstance()
        {
            QueueName = queueName,
            Artifact = new RandomForestInstance()
            {
                Attributes = new Dictionary<string, double>()
                {
                    ["SizeBytes"] = random.Next(0, 1000000000),
                    ["AvgInputPips"] = random.Next(0, 100000),
                    ["AvgOutputPips"] = random.Next(0, 100000),
                    ["AvgPositionForInputPips"] = random.NextDouble(),
                    ["AvgPositionForOutputPips"] = random.NextDouble(),
                    ["AvgDepsForInputPips"] = random.Next(0, 10000),
                    ["AvgDepsForOutputPips"] = random.Next(0, 10000),
                    ["AvgInputsForInputPips"] = random.Next(0, 100000),
                    ["AvgInputsForOutputPips"] = random.Next(0, 100000),
                    ["AvgOutputsForInputPips"] = random.Next(0, 100000),
                    ["AvgOutputsForOutputPips"] = random.Next(0, 100000),
                    ["AvgPriorityForInputPips"] = random.Next(0, 100),
                    ["AvgPriorityForOutputPips"] = random.Next(0, 100),
                    ["AvgWeightForInputPips"] = random.Next(0, 100),
                    ["AvgWeightForOutputPips"] = random.Next(0, 100),
                    ["AvgTagCountForInputPips"] = random.Next(0, 100),
                    ["AvgTagCountForOutputPips"] = random.Next(0, 100),
                    ["AvgSemaphoreCountForInputPips"] = random.Next(0, 100),
                    ["AvgSemaphoreCountForOutputPips"] = random.Next(0, 100)
                }
            }
        };
        var result = classifier.Classify(instance);
        if (result.Succeeded)
        {
            predictions.Add(instance, result.Value);
        }
        // tally the two interesting failure modes; everything else is ignored
        if (result.ReturnCode == ContentPlacementClassifierResult.ResultCode.ArtifactNotShared)
        {
            notShared++;
        }
        else if (result.ReturnCode == ContentPlacementClassifierResult.ResultCode.NoAlternativesForQueue)
        {
            noAlternatives++;
        }
    }
    classifyTimer.Stop();
    s_logger.Info($"Classifier ({numInstances} instances, {notShared} not shared, {noAlternatives} without alternatives) done in {classifyTimer.ElapsedMilliseconds}ms (perInstanceAvg={(1.0 * classifyTimer.ElapsedMilliseconds) / (1.0 * numInstances)}ms)");
    foreach (var kvp in predictions)
    {
        var instance = kvp.Key;
        var alternatives = kvp.Value;
        var uniqueCount = new HashSet<string>(alternatives).Count;
        var totalCount = alternatives.Count;
        if (uniqueCount != totalCount)
        {
            // highlight queues whose predicted machine list contains duplicates
            Console.ForegroundColor = ConsoleColor.Yellow;
        }
        s_logger.Info($"queue={instance.QueueName}, count={totalCount}, uniqueCount={uniqueCount}, alternatives=[{string.Join(",", alternatives)}]");
        Console.ResetColor();
        uniqueMachines.AddRange(alternatives);
    }
    foreach (var entry in classifier.AlternativesPerQueue())
    {
        uniqueMachines.AddRange(entry.Value);
    }
    s_logger.Info($"totalMachinesAvailable={uniqueMachines.Count}, avg={(1.0 * uniqueMachines.Count) /(1.0 * classifier.AlternativesPerQueue().Count)} per queue");
}