Example #1
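This example wires up a three-stage TPL Dataflow pipeline: build-artifact JSON files are parsed in parallel, each artifact set is linearized, and the linearized instances are classified so that the predicted machines can be stored, per reported path, in a RocksDB-backed prediction store.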
        private static void ClassifyInstances(Args arguments)
        {
            // validate the arguments first
            Contract.Requires(arguments.OutputDirectory != null, "You must specify an output directory");
            Contract.Requires(arguments.InputDirectory != null, "You must specify an input directory");
            Contract.Requires(Directory.Exists(arguments.OutputDirectory), "The output directory must exist");
            Contract.Requires(Directory.Exists(arguments.InputDirectory), "The input directory must exist");
            var sharedCount    = 0;
            var nonSharedCount = 0;

            try
            {
                // load the classifier first
                s_logger.Info("Loading classifier...");
                // work on the content store
                s_logger.Info($"Initializing store at [{arguments.OutputDirectory}]");
                var store     = new RocksDbContentPlacementPredictionStore(arguments.OutputDirectory, true);
                var opContext = new OperationContext(new Context(new LogWrapper(s_logger)));
                // initialize it, bailing out if startup fails
                var initialized = store.StartupAsync(opContext);
                initialized.Wait();
                if (!initialized.Result)
                {
                    s_logger.Error($"Could not initialize RocksDbContentPlacementPredictionStore at [{arguments.OutputDirectory}]");
                    return;
                }
                var classifier = new ContentPlacementClassifier(arguments.AppConfig.ClassifierConfiguration);
                // create the pipeline. The first step here is to parse the input files, and we can do this in parallel
                var buildArtifactParsingBlock = new TransformManyBlock<ParseBuildArtifactsInput, KeyValuePair<string, IReadOnlyList<ArtifactWithBuildMeta>>>(
                    i =>
                    {
                        var action = new ParseBuildArtifacts();
                        var result = action.PerformAction(i);
                        if (result.ExecutionStatus)
                        {
                            return result.Result.ArtifactsByHash.ToList();
                        }
                        else
                        {
                            s_logger.Error(result.Exception, $"Error when parsing [{i.BuildArtifactsFile}]");
                            throw result.Exception;
                        }
                    },
                    new ExecutionDataflowBlockOptions()
                    {
                        MaxDegreeOfParallelism = arguments.AppConfig.ConcurrencyConfig.MaxBuildParsingTasks
                    });
                // then, when we have one, we linearize it
                var linearizeBlock = new TransformBlock<KeyValuePair<string, IReadOnlyList<ArtifactWithBuildMeta>>, TimedActionResult<LinearizeArtifactsOutput>>(
                    i =>
                    {
                        var action = new LinearizeArtifacts();
                        return action.PerformAction(new LinearizeArtifactsInput(i.Key, i.Value));
                    },
                    new ExecutionDataflowBlockOptions()
                    {
                        MaxDegreeOfParallelism = arguments.AppConfig.ConcurrencyConfig.MaxArtifactLinearizationTasks
                    });

                // and we classify them
                var classifyBlock = new ActionBlock<TimedActionResult<LinearizeArtifactsOutput>>(
                    i =>
                    {
                        // each successful input carries a linearized ML instance
                        if (i.ExecutionStatus)
                        {
                            var cpInstance = new ContentPlacementInstance()
                            {
                                Artifact  = i.Result.Linear.AsInstance(),  // using the default utility method
                                QueueName = i.Result.Linear.Queues.First() // the first is enough, since there is always exactly one
                            };
                            var result = classifier.Classify(cpInstance);
                            if (result.Succeeded)
                            {
                                var selectedMachines = result.Value;
                                foreach (var path in i.Result.Linear.ReportedPaths)
                                {
                                    store.StoreResult(opContext, path, selectedMachines);
                                    Interlocked.Increment(ref sharedCount);
                                }
                            }
                            else
                            {
                                Interlocked.Increment(ref nonSharedCount);
                            }
                        }
                    },
                    new ExecutionDataflowBlockOptions()
                    {
                        MaxDegreeOfParallelism = arguments.AppConfig.ConcurrencyConfig.MaxArtifactClassificationTasks
                    });
                // link them
                var numParsingTasks = 0;
                buildArtifactParsingBlock.LinkTo(linearizeBlock, new DataflowLinkOptions { PropagateCompletion = true });
                linearizeBlock.LinkTo(classifyBlock, new DataflowLinkOptions { PropagateCompletion = true });
                // so now we can post to the initial queue
                foreach (var file in Directory.EnumerateFiles(arguments.InputDirectory, "*.json"))
                {
                    buildArtifactParsingBlock.Post(new ParseBuildArtifactsInput(file));
                    ++numParsingTasks;
                }
                s_logger.Info($"Posted {numParsingTasks} parsing tasks, processing");
                // now wait
                buildArtifactParsingBlock.Complete();
                classifyBlock.Completion.Wait();
                // and now we should snapshot
                var snapshotDir = Path.Combine(arguments.OutputDirectory, "Snap");
                Directory.CreateDirectory(snapshotDir);
                s_logger.Info($"Done, snapshoting to [{snapshotDir}]");
                var result = store.CreateSnapshot(opContext, snapshotDir);
                // done
            }
            finally
            {
                var total      = 1.0 * (sharedCount + nonSharedCount);
                var percentage = total > 0 ? sharedCount / total : 0.0;
                s_logger.Info($"Stats: shared={sharedCount} ({percentage}), nonShared={nonSharedCount}, total={total}");
            }
        }
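Both examples follow the same TPL Dataflow skeleton: link the blocks with PropagateCompletion, Post the inputs to the head block, call Complete() on it, and wait on the tail block's Completion. Below is a minimal, self-contained sketch of that pattern; the block names and inputs are illustrative, not taken from the codebase above.

using System;
using System.Threading.Tasks.Dataflow;

public static class PipelineSketch
{
    public static void Main()
    {
        // a transform stage with bounded parallelism, like the parsing/linearizing blocks above
        var transform = new TransformBlock<int, string>(
            n => $"item-{n}",
            new ExecutionDataflowBlockOptions { MaxDegreeOfParallelism = 4 });

        // a terminal stage, like the classify block
        var sink = new ActionBlock<string>(Console.WriteLine);

        // PropagateCompletion lets Complete() on the head flow down to the sink
        transform.LinkTo(sink, new DataflowLinkOptions { PropagateCompletion = true });

        // post the work, complete the head, then wait on the tail
        for (var i = 0; i < 10; ++i)
        {
            transform.Post(i);
        }
        transform.Complete();
        sink.Completion.Wait();
    }
}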
Example #2
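This example linearizes a database of artifacts through a two-stage dataflow pipeline that writes every result to a single synchronized CSV file, then draws a configurable number of scaled samples from the collected artifacts.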
        private static void LinearizeDatabase(Args arguments)
        {
            // validate the arguments first
            Contract.Requires(arguments.InputDirectory != null, "You must specify an input directory");
            Contract.Requires(Directory.Exists(arguments.InputDirectory), "The input directory must exist");
            var collectedArtifacts = new MultiValueDictionary<int, MLArtifact>();
            var currentTicks       = Environment.TickCount;
            var linearFile         = Path.Combine(arguments.InputDirectory, $"{currentTicks}.csv");
            var linearOutput       = TextWriter.Synchronized(new StreamWriter(linearFile));

            s_logger.Info($"Linearizing to [{linearFile}]");
            // write the headers
            MLArtifact.WriteColumnsToStream(linearOutput);
            // so now we are ready to linearize
            var linearizeBlock = new TransformBlock<LinearizeArtifactsInput, TimedActionResult<LinearizeArtifactsOutput>>(
                i =>
                {
                    var action = new LinearizeArtifacts(linearOutput);
                    return action.PerformAction(i);
                },
                new ExecutionDataflowBlockOptions()
                {
                    MaxDegreeOfParallelism = arguments.AppConfig.ConcurrencyConfig.MaxArtifactLinearizationTasks
                });
            var collectLinearResultsBlock = new ActionBlock<TimedActionResult<LinearizeArtifactsOutput>>(
                i =>
                {
                    if (i.ExecutionStatus)
                    {
                        collectedArtifacts.Add(i.Result.NumQueues, i.Result.Linear);
                    }
                },
                // enforce serial collection
                new ExecutionDataflowBlockOptions()
                {
                    MaxDegreeOfParallelism = 1
                });

            // connect
            linearizeBlock.LinkTo(collectLinearResultsBlock, new DataflowLinkOptions { PropagateCompletion = true });
            // and post the tasks
            var posted = 0;

            foreach (var hashDir in Directory.EnumerateDirectories(arguments.InputDirectory))
            {
                linearizeBlock.Post(new LinearizeArtifactsInput(hashDir));
                ++posted;
            }
            s_logger.Info($"Posted {posted} linearizing tasks, waiting...");
            linearizeBlock.Complete();
            // and wait
            collectLinearResultsBlock.Completion.Wait();
            // and close...
            linearOutput.Close();
            // now, scale to create the samples...
            s_logger.Info($"Creating {arguments.NumSamples} samples of size {arguments.SampleSize}");
            var scale = new Dictionary<int, int>();

            foreach (var entry in collectedArtifacts)
            {
                var queueCount = entry.Key;
                var entryCount = entry.Value.Count;
                var proportion = 1.0 * Math.BigMul(entryCount, arguments.SampleSize) / (1.0 * posted);
                scale[queueCount] = (int)Math.Ceiling(proportion);
            }
            // we have the scale, let's post the sampling tasks
            var createSampleBlocks = new ActionBlock<SampleArtifactsInput>(
                i =>
                {
                    var action = new SampleArtifacts();
                    action.PerformAction(i);
                },
                // one task per sample
                new ExecutionDataflowBlockOptions()
                {
                    MaxDegreeOfParallelism = arguments.NumSamples
                });

            // post some tasks in here
            for (var i = 0; i < arguments.NumSamples; ++i)
            {
                createSampleBlocks.Post(new SampleArtifactsInput(Path.Combine(arguments.InputDirectory, $"{currentTicks}-sample{i}.csv"), scale, collectedArtifacts));
            }
            // and wait...
            createSampleBlocks.Complete();
            createSampleBlocks.Completion.Wait();
            // done...
        }
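Example #2 also leans on TextWriter.Synchronized so that many concurrent linearization tasks can append rows to one CSV file without interleaving. A minimal sketch of that pattern follows; the file name and row contents are illustrative.

using System.IO;
using System.Threading.Tasks;

public static class SynchronizedWriterSketch
{
    public static void Main()
    {
        // wrap a StreamWriter so concurrent writers cannot interleave partial lines
        var writer = TextWriter.Synchronized(new StreamWriter("rows.csv"));

        // many concurrent producers, one thread-safe sink
        Parallel.For(0, 100, i => writer.WriteLine($"row,{i}"));

        // Close() flushes and releases the underlying file
        writer.Close();
    }
}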